author     James Zern <jzern@google.com>  2023-02-11 01:59:28 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2023-02-11 01:59:28 +0000
commit     bf1dc73f9c37298342c6de1c19946eed77bc9966 (patch)
tree       df63e8c8d3c192afa6e70fa85ab493f9b3ccfc81
parent     b66a35fa1eb0066c89540864ad0932f2fb01a712 (diff)
parent     4ee09a1eb97fed5af3918f17d2ebfb0d8d0ec790 (diff)
download   libaom-bf1dc73f9c37298342c6de1c19946eed77bc9966.tar.gz
Merge commit 'v3.6.0' into m/master am: 4ee09a1eb9
Original change: https://android-review.googlesource.com/c/platform/external/libaom/+/2398194
Change-Id: I4a9aafd626d1b26e7e0129c021025014884c835d
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--.clang-format141
-rw-r--r--.cmake-format.py2
-rw-r--r--.mailmap1
-rw-r--r--AUTHORS13
-rw-r--r--Android.bp65
-rw-r--r--CHANGELOG67
-rw-r--r--CMakeLists.txt23
-rw-r--r--PATENTS2
-rw-r--r--README.android8
-rw-r--r--README.md24
-rw-r--r--README.version4
-rw-r--r--aom/aom_codec.h20
-rw-r--r--aom/aom_encoder.h18
-rw-r--r--aom/aom_external_partition.h12
-rw-r--r--aom/aomcx.h53
-rw-r--r--aom_dsp/aom_dsp.cmake69
-rwxr-xr-xaom_dsp/aom_dsp_rtcd_defs.pl447
-rw-r--r--aom_dsp/aom_simd_inline.h3
-rw-r--r--aom_dsp/arm/avg_neon.c94
-rw-r--r--aom_dsp/arm/hadamard_neon.c33
-rw-r--r--aom_dsp/arm/highbd_intrapred_neon.c12
-rw-r--r--aom_dsp/arm/mem_neon.h109
-rw-r--r--aom_dsp/arm/sad4d_neon.c992
-rw-r--r--aom_dsp/arm/sad_neon.c1122
-rw-r--r--aom_dsp/arm/sse_neon.c421
-rw-r--r--aom_dsp/arm/subpel_variance_neon.c819
-rw-r--r--aom_dsp/arm/sum_neon.h37
-rw-r--r--aom_dsp/arm/sum_squares_neon.c273
-rw-r--r--aom_dsp/arm/variance_neon.c1195
-rw-r--r--aom_dsp/avg.c69
-rw-r--r--aom_dsp/flow_estimation/corner_detect.c (renamed from av1/encoder/corner_detect.c)2
-rw-r--r--aom_dsp/flow_estimation/corner_detect.h (renamed from av1/encoder/corner_detect.h)14
-rw-r--r--aom_dsp/flow_estimation/corner_match.c (renamed from av1/encoder/corner_match.c)79
-rw-r--r--aom_dsp/flow_estimation/corner_match.h (renamed from av1/encoder/corner_match.h)26
-rw-r--r--aom_dsp/flow_estimation/disflow.c634
-rw-r--r--aom_dsp/flow_estimation/disflow.h32
-rw-r--r--aom_dsp/flow_estimation/flow_estimation.c59
-rw-r--r--aom_dsp/flow_estimation/flow_estimation.h65
-rw-r--r--aom_dsp/flow_estimation/ransac.c (renamed from av1/encoder/ransac.c)5
-rw-r--r--aom_dsp/flow_estimation/ransac.h (renamed from av1/encoder/ransac.h)18
-rw-r--r--aom_dsp/flow_estimation/x86/corner_match_avx2.c (renamed from av1/encoder/x86/corner_match_avx2.c)4
-rw-r--r--aom_dsp/flow_estimation/x86/corner_match_sse4.c (renamed from av1/encoder/x86/corner_match_sse4.c)4
-rw-r--r--aom_dsp/intrapred.c6
-rw-r--r--aom_dsp/mathutils.h251
-rw-r--r--aom_dsp/mips/aom_convolve8_horiz_msa.c693
-rw-r--r--aom_dsp/mips/aom_convolve8_vert_msa.c699
-rw-r--r--aom_dsp/mips/aom_convolve_copy_dspr2.c214
-rw-r--r--aom_dsp/mips/aom_convolve_copy_msa.c241
-rw-r--r--aom_dsp/mips/aom_convolve_msa.h79
-rw-r--r--aom_dsp/mips/common_dspr2.c31
-rw-r--r--aom_dsp/mips/common_dspr2.h51
-rw-r--r--aom_dsp/mips/convolve2_dspr2.c1031
-rw-r--r--aom_dsp/mips/convolve2_horiz_dspr2.c681
-rw-r--r--aom_dsp/mips/convolve2_vert_dspr2.c237
-rw-r--r--aom_dsp/mips/convolve8_horiz_dspr2.c879
-rw-r--r--aom_dsp/mips/convolve8_vert_dspr2.c361
-rw-r--r--aom_dsp/mips/convolve_common_dspr2.h48
-rw-r--r--aom_dsp/mips/intrapred16_dspr2.c327
-rw-r--r--aom_dsp/mips/intrapred4_dspr2.c82
-rw-r--r--aom_dsp/mips/intrapred8_dspr2.c150
-rw-r--r--aom_dsp/mips/intrapred_msa.c550
-rw-r--r--aom_dsp/mips/loopfilter_16_msa.c1488
-rw-r--r--aom_dsp/mips/loopfilter_4_msa.c147
-rw-r--r--aom_dsp/mips/loopfilter_8_msa.c333
-rw-r--r--aom_dsp/mips/loopfilter_filters_dspr2.c328
-rw-r--r--aom_dsp/mips/loopfilter_filters_dspr2.h736
-rw-r--r--aom_dsp/mips/loopfilter_macros_dspr2.h437
-rw-r--r--aom_dsp/mips/loopfilter_masks_dspr2.h357
-rw-r--r--aom_dsp/mips/loopfilter_mb_dspr2.c590
-rw-r--r--aom_dsp/mips/loopfilter_mb_horiz_dspr2.c734
-rw-r--r--aom_dsp/mips/loopfilter_mb_vert_dspr2.c758
-rw-r--r--aom_dsp/mips/loopfilter_msa.h251
-rw-r--r--aom_dsp/mips/macros_msa.h2058
-rw-r--r--aom_dsp/mips/sad_msa.c800
-rw-r--r--aom_dsp/mips/sub_pixel_variance_msa.c1792
-rw-r--r--aom_dsp/mips/subtract_msa.c266
-rw-r--r--aom_dsp/mips/variance_msa.c633
-rw-r--r--aom_dsp/noise_model.c5
-rw-r--r--aom_dsp/psnr.c72
-rw-r--r--aom_dsp/rect.h35
-rw-r--r--aom_dsp/sad.c64
-rw-r--r--aom_dsp/simd/v128_intrinsics_x86.h48
-rw-r--r--aom_dsp/simd/v256_intrinsics_c.h2
-rw-r--r--aom_dsp/simd/v256_intrinsics_x86.h51
-rw-r--r--aom_dsp/simd/v64_intrinsics_c.h82
-rw-r--r--aom_dsp/simd/v64_intrinsics_x86.h52
-rw-r--r--aom_dsp/variance.c51
-rw-r--r--aom_dsp/variance.h1
-rw-r--r--aom_dsp/x86/avg_intrin_avx2.c390
-rw-r--r--aom_dsp/x86/avg_intrin_sse2.c239
-rw-r--r--aom_dsp/x86/avg_intrin_sse4.c59
-rw-r--r--aom_dsp/x86/bitdepth_conversion_sse2.h24
-rw-r--r--aom_dsp/x86/blend_a64_mask_avx2.c48
-rw-r--r--aom_dsp/x86/blend_a64_mask_sse4.c38
-rw-r--r--aom_dsp/x86/convolve_avx2.h7
-rw-r--r--aom_dsp/x86/fwd_txfm_sse2.h4
-rw-r--r--aom_dsp/x86/highbd_convolve_avx2.c9
-rw-r--r--aom_dsp/x86/highbd_quantize_intrin_sse2.c7
-rw-r--r--aom_dsp/x86/highbd_sad_avx2.c140
-rw-r--r--aom_dsp/x86/highbd_variance_avx2.c6
-rw-r--r--aom_dsp/x86/highbd_variance_sse2.c13
-rw-r--r--aom_dsp/x86/intrapred_avx2.c56
-rw-r--r--aom_dsp/x86/intrapred_sse2.c92
-rw-r--r--aom_dsp/x86/intrapred_sse4.c24
-rw-r--r--aom_dsp/x86/intrapred_ssse3.c2633
-rw-r--r--aom_dsp/x86/jnt_variance_ssse3.c17
-rw-r--r--aom_dsp/x86/loopfilter_avx2.c516
-rw-r--r--aom_dsp/x86/loopfilter_sse2.c39
-rw-r--r--aom_dsp/x86/lpf_common_sse2.h88
-rw-r--r--aom_dsp/x86/masked_sad4d_ssse3.c32
-rw-r--r--aom_dsp/x86/masked_sad_intrin_ssse3.c23
-rw-r--r--aom_dsp/x86/masked_variance_intrin_ssse3.c26
-rw-r--r--aom_dsp/x86/mem_sse2.h20
-rw-r--r--aom_dsp/x86/obmc_intrinsic_sse4.h2
-rw-r--r--aom_dsp/x86/quantize_avx2.c4
-rw-r--r--aom_dsp/x86/sad4d_avx2.c251
-rw-r--r--aom_dsp/x86/sad4d_sse2.asm432
-rw-r--r--aom_dsp/x86/subtract_avx2.c17
-rw-r--r--aom_dsp/x86/sum_squares_avx2.c2
-rw-r--r--aom_dsp/x86/sum_squares_sse2.c8
-rw-r--r--aom_dsp/x86/variance_avx2.c545
-rw-r--r--aom_dsp/x86/variance_impl_avx2.c1386
-rw-r--r--aom_dsp/x86/variance_impl_ssse3.c4
-rw-r--r--aom_dsp/x86/variance_sse2.c94
-rw-r--r--aom_mem/aom_mem.c7
-rw-r--r--aom_mem/aom_mem.h8
-rw-r--r--aom_ports/mem.h2
-rw-r--r--aom_ports/x86.h2
-rw-r--r--aom_scale/aom_scale.cmake8
-rw-r--r--aom_scale/aom_scale_rtcd.pl4
-rw-r--r--aom_scale/generic/yv12extend.c83
-rw-r--r--aom_scale/mips/dspr2/yv12extend_dspr2.c143
-rw-r--r--aom_util/endian_inl.h15
-rw-r--r--apps/aomdec.c57
-rw-r--r--apps/aomenc.c19
-rw-r--r--av1/arg_defs.c15
-rw-r--r--av1/arg_defs.h2
-rw-r--r--av1/av1.cmake35
-rw-r--r--av1/av1_cx_iface.c159
-rw-r--r--av1/av1_cx_iface.h4
-rw-r--r--av1/av1_dx_iface.c24
-rw-r--r--av1/av1_iface_common.h2
-rw-r--r--av1/common/alloccommon.c19
-rw-r--r--av1/common/arm/av1_inv_txfm_neon.c3
-rw-r--r--av1/common/arm/cdef_block_neon.c10
-rw-r--r--av1/common/arm/convolve_neon.c2873
-rw-r--r--av1/common/arm/convolve_neon.h421
-rw-r--r--av1/common/arm/jnt_convolve_neon.c1297
-rw-r--r--av1/common/arm/resize_neon.c111
-rw-r--r--av1/common/arm/wiener_convolve_neon.c464
-rw-r--r--av1/common/av1_common_int.h16
-rw-r--r--av1/common/av1_loopfilter.c52
-rw-r--r--av1/common/av1_loopfilter.h30
-rw-r--r--av1/common/av1_rtcd_defs.pl22
-rw-r--r--av1/common/blockd.h11
-rw-r--r--av1/common/cdef.c51
-rw-r--r--av1/common/cdef.h2
-rw-r--r--av1/common/cdef_block.c4
-rw-r--r--av1/common/cdef_block.h11
-rw-r--r--av1/common/cdef_block_simd.h24
-rw-r--r--av1/common/common_data.c43
-rw-r--r--av1/common/common_data.h29
-rw-r--r--av1/common/convolve.c39
-rw-r--r--av1/common/convolve.h9
-rw-r--r--av1/common/entropymode.c119
-rw-r--r--av1/common/entropymode.h13
-rw-r--r--av1/common/enums.h8
-rw-r--r--av1/common/filter.h19
-rw-r--r--av1/common/mv.h25
-rw-r--r--av1/common/pred_common.h46
-rw-r--r--av1/common/reconinter.c257
-rw-r--r--av1/common/reconinter.h150
-rw-r--r--av1/common/reconinter_template.inc267
-rw-r--r--av1/common/resize.c64
-rw-r--r--av1/common/resize.h32
-rw-r--r--av1/common/restoration.c28
-rw-r--r--av1/common/restoration.h16
-rw-r--r--av1/common/scale.c42
-rw-r--r--av1/common/scale.h28
-rw-r--r--av1/common/seg_common.h12
-rw-r--r--av1/common/thread_common.c203
-rw-r--r--av1/common/thread_common.h150
-rw-r--r--av1/common/tile_common.c27
-rw-r--r--av1/common/tile_common.h10
-rw-r--r--av1/common/txb_common.h170
-rw-r--r--av1/common/warped_motion.h1
-rw-r--r--av1/common/x86/av1_convolve_scale_sse4.c26
-rw-r--r--av1/common/x86/av1_inv_txfm_ssse3.c2
-rw-r--r--av1/common/x86/cdef_block_avx2.c111
-rw-r--r--av1/common/x86/cdef_block_sse2.c10
-rw-r--r--av1/common/x86/cdef_block_sse4.c8
-rw-r--r--av1/common/x86/cdef_block_ssse3.c10
-rw-r--r--av1/common/x86/convolve_2d_avx2.c66
-rw-r--r--av1/common/x86/convolve_avx2.c86
-rw-r--r--av1/common/x86/convolve_sse2.c30
-rw-r--r--av1/common/x86/highbd_convolve_2d_avx2.c6
-rw-r--r--av1/common/x86/highbd_inv_txfm_sse4.c6
-rw-r--r--av1/common/x86/highbd_jnt_convolve_avx2.c2
-rw-r--r--av1/common/x86/highbd_warp_affine_avx2.c8
-rw-r--r--av1/common/x86/intra_edge_sse4.c2
-rw-r--r--av1/common/x86/jnt_convolve_avx2.c398
-rw-r--r--av1/common/x86/jnt_convolve_sse2.c30
-rw-r--r--av1/common/x86/reconinter_sse4.c8
-rw-r--r--av1/common/x86/resize_ssse3.c125
-rw-r--r--av1/common/x86/selfguided_avx2.c4
-rw-r--r--av1/common/x86/selfguided_sse4.c4
-rw-r--r--av1/common/x86/warp_plane_avx2.c4
-rw-r--r--av1/common/x86/warp_plane_sse2.c2
-rw-r--r--av1/decoder/decodeframe.c45
-rw-r--r--av1/decoder/decodemv.c53
-rw-r--r--av1/decoder/decoder.c13
-rw-r--r--av1/encoder/allintra_vis.c502
-rw-r--r--av1/encoder/allintra_vis.h7
-rw-r--r--av1/encoder/aq_complexity.c71
-rw-r--r--av1/encoder/aq_cyclicrefresh.c186
-rw-r--r--av1/encoder/aq_cyclicrefresh.h30
-rw-r--r--av1/encoder/aq_variance.c14
-rw-r--r--av1/encoder/arm/crc32/hash_crc32.c54
-rw-r--r--av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c118
-rw-r--r--av1/encoder/arm/neon/temporal_filter_neon.c433
-rw-r--r--av1/encoder/arm/neon/wedge_utils_neon.c77
-rw-r--r--av1/encoder/av1_quantize.c29
-rw-r--r--av1/encoder/av1_temporal_denoiser.c51
-rw-r--r--av1/encoder/av1_temporal_denoiser.h17
-rw-r--r--av1/encoder/bitstream.c92
-rw-r--r--av1/encoder/block.h42
-rw-r--r--av1/encoder/compound_type.c12
-rw-r--r--av1/encoder/encode_strategy.c153
-rw-r--r--av1/encoder/encode_strategy.h6
-rw-r--r--av1/encoder/encodeframe.c360
-rw-r--r--av1/encoder/encodeframe_utils.c233
-rw-r--r--av1/encoder/encodeframe_utils.h34
-rw-r--r--av1/encoder/encodemb.h2
-rw-r--r--av1/encoder/encoder.c554
-rw-r--r--av1/encoder/encoder.h207
-rw-r--r--av1/encoder/encoder_alloc.h3
-rw-r--r--av1/encoder/encoder_utils.c23
-rw-r--r--av1/encoder/encoder_utils.h73
-rw-r--r--av1/encoder/ethread.c485
-rw-r--r--av1/encoder/ethread.h6
-rw-r--r--av1/encoder/firstpass.c51
-rw-r--r--av1/encoder/firstpass.h3
-rw-r--r--av1/encoder/global_motion.c690
-rw-r--r--av1/encoder/global_motion.h41
-rw-r--r--av1/encoder/global_motion_facade.c26
-rw-r--r--av1/encoder/gop_structure.c14
-rw-r--r--av1/encoder/gop_structure.h4
-rw-r--r--av1/encoder/intra_mode_search.c11
-rw-r--r--av1/encoder/intra_mode_search.h8
-rw-r--r--av1/encoder/k_means_template.h100
-rw-r--r--av1/encoder/level.c114
-rw-r--r--av1/encoder/lookahead.c5
-rw-r--r--av1/encoder/mcomp.c365
-rw-r--r--av1/encoder/mcomp.h26
-rw-r--r--av1/encoder/mcomp_structs.h2
-rw-r--r--av1/encoder/mips/msa/error_msa.c109
-rw-r--r--av1/encoder/mips/msa/fdct4x4_msa.c46
-rw-r--r--av1/encoder/mips/msa/temporal_filter_msa.c286
-rw-r--r--av1/encoder/model_rd.h16
-rw-r--r--av1/encoder/motion_search_facade.c19
-rw-r--r--av1/encoder/motion_search_facade.h27
-rw-r--r--av1/encoder/nonrd_opt.h37
-rw-r--r--av1/encoder/nonrd_pickmode.c2889
-rw-r--r--av1/encoder/palette.c75
-rw-r--r--av1/encoder/palette.h27
-rw-r--r--av1/encoder/partition_search.c638
-rw-r--r--av1/encoder/partition_strategy.c9
-rw-r--r--av1/encoder/pass2_strategy.c96
-rw-r--r--av1/encoder/pass2_strategy.h16
-rw-r--r--av1/encoder/pickcdef.c349
-rw-r--r--av1/encoder/pickcdef.h15
-rw-r--r--av1/encoder/picklpf.c3
-rw-r--r--av1/encoder/picklpf.h2
-rw-r--r--av1/encoder/pickrst.c19
-rw-r--r--av1/encoder/pickrst.h2
-rw-r--r--av1/encoder/ratectrl.c824
-rw-r--r--av1/encoder/ratectrl.h56
-rw-r--r--av1/encoder/rd.c82
-rw-r--r--av1/encoder/rd.h33
-rw-r--r--av1/encoder/rdopt.c33
-rw-r--r--av1/encoder/rdopt.h13
-rw-r--r--av1/encoder/reconinter_enc.c88
-rw-r--r--av1/encoder/reconinter_enc.h4
-rw-r--r--av1/encoder/segmentation.c2
-rw-r--r--av1/encoder/speed_features.c265
-rw-r--r--av1/encoder/speed_features.h136
-rw-r--r--av1/encoder/superres_scale.c2
-rw-r--r--av1/encoder/svc_layercontext.c193
-rw-r--r--av1/encoder/svc_layercontext.h60
-rw-r--r--av1/encoder/temporal_filter.c35
-rw-r--r--av1/encoder/temporal_filter.h2
-rw-r--r--av1/encoder/thirdpass.c139
-rw-r--r--av1/encoder/tokenize.c151
-rw-r--r--av1/encoder/tpl_model.c126
-rw-r--r--av1/encoder/tpl_model.h20
-rw-r--r--av1/encoder/tx_search.c2
-rw-r--r--av1/encoder/tx_search.h6
-rw-r--r--av1/encoder/var_based_part.c248
-rw-r--r--av1/encoder/var_based_part.h4
-rw-r--r--av1/encoder/x86/av1_fwd_txfm2d_avx2.c328
-rw-r--r--av1/encoder/x86/av1_fwd_txfm_sse2.c158
-rw-r--r--av1/encoder/x86/av1_fwd_txfm_sse2.h134
-rw-r--r--av1/encoder/x86/av1_k_means_avx2.c100
-rw-r--r--av1/encoder/x86/av1_k_means_sse2.c97
-rw-r--r--av1/encoder/x86/av1_quantize_avx2.c58
-rw-r--r--av1/encoder/x86/error_intrin_avx2.c151
-rw-r--r--av1/encoder/x86/highbd_block_error_intrin_sse2.c2
-rw-r--r--av1/encoder/x86/pickrst_avx2.c16
-rw-r--r--av1/encoder/x86/pickrst_sse4.c16
-rw-r--r--av1/encoder/x86/rdopt_avx2.c4
-rw-r--r--av1/encoder/x86/rdopt_sse4.c8
-rw-r--r--av1/encoder/x86/reconinter_enc_sse2.c13
-rw-r--r--av1/encoder/x86/reconinter_enc_ssse3.c9
-rw-r--r--av1/encoder/x86/wedge_utils_avx2.c6
-rw-r--r--av1/encoder/x86/wedge_utils_sse2.c4
-rw-r--r--av1/qmode_rc/ducky_encode.cc (renamed from av1/ducky_encode.cc)257
-rw-r--r--av1/qmode_rc/ducky_encode.h (renamed from av1/ducky_encode.h)34
-rw-r--r--av1/qmode_rc/ratectrl_qmode.cc (renamed from av1/ratectrl_qmode.cc)344
-rw-r--r--av1/qmode_rc/ratectrl_qmode.h (renamed from av1/ratectrl_qmode.h)39
-rw-r--r--av1/qmode_rc/ratectrl_qmode_interface.cc (renamed from av1/ratectrl_qmode_interface.cc)2
-rw-r--r--av1/qmode_rc/ratectrl_qmode_interface.h (renamed from av1/ratectrl_qmode_interface.h)133
-rw-r--r--av1/qmode_rc/reference_manager.cc (renamed from av1/reference_manager.cc)44
-rw-r--r--av1/qmode_rc/reference_manager.h (renamed from av1/reference_manager.h)16
-rw-r--r--av1/ratectrl_rtc.cc18
-rw-r--r--build/cmake/aom_config_defaults.cmake13
-rw-r--r--build/cmake/aom_configure.cmake19
-rw-r--r--build/cmake/cpu.cmake32
-rwxr-xr-xbuild/cmake/rtcd.pl44
-rw-r--r--build/cmake/toolchains/arm64-linux-gcc.cmake12
-rw-r--r--build/cmake/toolchains/arm64-macos.cmake16
-rw-r--r--build/cmake/toolchains/arm64-mingw-gcc.cmake16
-rw-r--r--build/cmake/toolchains/armv7-linux-gcc.cmake12
-rw-r--r--build/cmake/toolchains/armv7-mingw-gcc.cmake16
-rw-r--r--build/cmake/toolchains/mips32-linux-gcc.cmake78
-rw-r--r--build/cmake/toolchains/mips64-linux-gcc.cmake55
-rw-r--r--build/cmake/toolchains/ppc-linux-gcc.cmake12
-rw-r--r--build/cmake/toolchains/riscv-linux-gcc.cmake35
-rw-r--r--build/cmake/toolchains/x86-mingw-gcc.cmake16
-rw-r--r--build/cmake/toolchains/x86_64-macos.cmake16
-rw-r--r--build/cmake/toolchains/x86_64-mingw-gcc.cmake16
-rw-r--r--common/ivfdec.c27
-rw-r--r--common/ivfdec.h5
-rw-r--r--common/md5_utils.c14
-rw-r--r--common/obudec.c160
-rw-r--r--common/tools_common.c69
-rw-r--r--common/tools_common.h13
-rw-r--r--common/video_reader.c24
-rw-r--r--common/webmdec.h2
-rw-r--r--common/y4menc.c35
-rw-r--r--config/arm/config/aom_config.asm8
-rw-r--r--config/arm/config/aom_config.h8
-rw-r--r--config/arm/config/aom_dsp_rtcd.h423
-rw-r--r--config/arm/config/aom_scale_rtcd.h3
-rw-r--r--config/arm/config/av1_rtcd.h21
-rw-r--r--config/arm64/config/aom_config.asm8
-rw-r--r--config/arm64/config/aom_config.h8
-rw-r--r--config/arm64/config/aom_dsp_rtcd.h423
-rw-r--r--config/arm64/config/aom_scale_rtcd.h3
-rw-r--r--config/arm64/config/av1_rtcd.h21
-rw-r--r--config/config/aom_version.h10
-rw-r--r--config/x86/config/aom_config.asm6
-rw-r--r--config/x86/config/aom_config.h8
-rw-r--r--config/x86/config/aom_dsp_rtcd.h158
-rw-r--r--config/x86/config/aom_scale_rtcd.h3
-rw-r--r--config/x86/config/av1_rtcd.h24
-rw-r--r--config/x86_64/config/aom_config.asm6
-rw-r--r--config/x86_64/config/aom_config.h8
-rw-r--r--config/x86_64/config/aom_dsp_rtcd.h158
-rw-r--r--config/x86_64/config/aom_scale_rtcd.h3
-rw-r--r--config/x86_64/config/av1_rtcd.h24
-rw-r--r--examples/set_maps.c6
-rw-r--r--examples/svc_encoder_rtc.c198
-rw-r--r--libs.doxy_template57
-rw-r--r--test/acm_random.h27
-rw-r--r--test/allintra_end_to_end_test.cc10
-rw-r--r--test/aom_image_test.cc4
-rwxr-xr-xtest/aomdec.sh22
-rw-r--r--test/av1_convolve_test.cc94
-rw-r--r--test/av1_k_means_test.cc48
-rw-r--r--test/av1_quantize_test.cc8
-rw-r--r--test/av1_wedge_utils_test.cc9
-rw-r--r--test/avg_test.cc210
-rw-r--r--test/avif_progressive_test.cc184
-rw-r--r--test/cfl_test.cc4
-rw-r--r--test/cnn_test.cc2
-rw-r--r--test/corner_match_test.cc4
-rw-r--r--test/datarate_test.cc77
-rw-r--r--test/datarate_test.h61
-rw-r--r--test/deltaq_mode_test.cc86
-rw-r--r--test/ducky_encode_test.cc92
-rw-r--r--test/encode_api_test.cc29
-rw-r--r--test/encodetxb_test.cc2
-rw-r--r--test/error_block_test.cc8
-rw-r--r--test/error_resilience_test.cc9
-rw-r--r--test/force_key_frame_test.cc93
-rw-r--r--test/forced_max_frame_width_height_test.cc173
-rw-r--r--test/frame_size_tests.cc112
-rw-r--r--test/hadamard_test.cc78
-rw-r--r--test/hash_test.cc7
-rw-r--r--test/horver_correlation_test.cc8
-rw-r--r--test/horz_superres_test.cc2
-rw-r--r--test/invalid_file_test.cc2
-rw-r--r--test/kf_test.cc6
-rw-r--r--test/level_test.cc10
-rw-r--r--test/lpf_test.cc1
-rw-r--r--test/md5_helper.h2
-rw-r--r--test/mock_ratectrl_qmode.h15
-rw-r--r--test/monochrome_test.cc47
-rw-r--r--test/postproc_filters_test.cc140
-rw-r--r--test/ratectrl_qmode_test.cc166
-rw-r--r--test/ratectrl_rtc_test.cc71
-rw-r--r--test/resize_test.cc110
-rw-r--r--test/rt_end_to_end_test.cc2
-rw-r--r--test/sad_test.cc510
-rw-r--r--test/sb_qp_sweep_test.cc147
-rw-r--r--test/sse_sum_test.cc7
-rw-r--r--test/subtract_test.cc278
-rw-r--r--test/svc_datarate_test.cc170
-rw-r--r--test/temporal_filter_test.cc9
-rw-r--r--test/test-data.sha15
-rw-r--r--test/test.cmake23
-rw-r--r--test/test_data_util.cmake5
-rw-r--r--test/test_intra_pred_speed.cc46
-rw-r--r--test/test_vectors.cc485
-rw-r--r--test/variance_test.cc441
-rw-r--r--test/video_source.h4
-rw-r--r--test/wiener_test.cc13
-rw-r--r--third_party/SVT-AV1/EbMemory_AVX2.h110
-rw-r--r--third_party/SVT-AV1/EbMemory_SSE4_1.h38
-rw-r--r--third_party/SVT-AV1/LICENSE.md32
-rw-r--r--third_party/SVT-AV1/PATENTS.md107
-rw-r--r--third_party/SVT-AV1/README.libaom14
-rw-r--r--third_party/SVT-AV1/convolve_2d_avx2.h1199
-rw-r--r--third_party/SVT-AV1/convolve_avx2.h3335
-rw-r--r--third_party/SVT-AV1/synonyms.h31
-rw-r--r--third_party/googletest/README.libaom9
-rw-r--r--third_party/googletest/src/.clang-format4
-rw-r--r--third_party/googletest/src/CMakeLists.txt14
-rw-r--r--third_party/googletest/src/CONTRIBUTORS2
-rw-r--r--third_party/googletest/src/README.md15
-rw-r--r--third_party/googletest/src/googlemock/CMakeLists.txt20
-rw-r--r--third_party/googletest/src/googlemock/README.md4
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-actions.h1075
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h8
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h125
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h718
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h603
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h15
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h38
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h589
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/gmock.h8
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md2
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h3
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h7
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h5
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h101
-rw-r--r--third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h86
-rw-r--r--third_party/googletest/src/googlemock/src/gmock-cardinalities.cc12
-rw-r--r--third_party/googletest/src/googlemock/src/gmock-internal-utils.cc112
-rw-r--r--third_party/googletest/src/googlemock/src/gmock-matchers.cc13
-rw-r--r--third_party/googletest/src/googlemock/src/gmock-spec-builders.cc225
-rw-r--r--third_party/googletest/src/googlemock/src/gmock.cc62
-rw-r--r--third_party/googletest/src/googlemock/src/gmock_main.cc4
-rw-r--r--third_party/googletest/src/googletest/CMakeLists.txt19
-rw-r--r--third_party/googletest/src/googletest/README.md8
-rw-r--r--third_party/googletest/src/googletest/cmake/internal_utils.cmake10
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h237
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-death-test.h89
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-matchers.h76
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-message.h27
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-param-test.h87
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-printers.h65
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-spi.h132
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-test-part.h14
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h38
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest.h532
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h200
-rw-r--r--third_party/googletest/src/googletest/include/gtest/gtest_prod.h9
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/custom/README.md12
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h31
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h74
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h17
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h330
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h145
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h100
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h982
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h18
-rw-r--r--third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h21
-rw-r--r--third_party/googletest/src/googletest/src/gtest-all.cc3
-rw-r--r--third_party/googletest/src/googletest/src/gtest-assertion-result.cc77
-rw-r--r--third_party/googletest/src/googletest/src/gtest-death-test.cc520
-rw-r--r--third_party/googletest/src/googletest/src/gtest-filepath.cc78
-rw-r--r--third_party/googletest/src/googletest/src/gtest-internal-inl.h217
-rw-r--r--third_party/googletest/src/googletest/src/gtest-matchers.cc5
-rw-r--r--third_party/googletest/src/googletest/src/gtest-port.cc349
-rw-r--r--third_party/googletest/src/googletest/src/gtest-printers.cc124
-rw-r--r--third_party/googletest/src/googletest/src/gtest-test-part.cc19
-rw-r--r--third_party/googletest/src/googletest/src/gtest-typed-test.cc7
-rw-r--r--third_party/googletest/src/googletest/src/gtest.cc1509
-rw-r--r--third_party/googletest/src/googletest/src/gtest_main.cc5
-rw-r--r--tools/dump_obu.cc10
501 files changed, 40061 insertions, 37982 deletions
diff --git a/.clang-format b/.clang-format
index a37882007..a8bc4967c 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,148 +1,9 @@
---
Language: Cpp
-# BasedOnStyle: Google
-# Generated with clang-format 7.0.1
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: Left
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
+BasedOnStyle: Google
AllowShortCaseLabelsOnASingleLine: true
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
- AfterClass: false
- AfterControlStatement: false
- AfterEnum: false
- AfterFunction: false
- AfterNamespace: false
- AfterObjCDeclaration: false
- AfterStruct: false
- AfterUnion: false
- AfterExternBlock: false
- BeforeCatch: false
- BeforeElse: false
- IndentBraces: false
- SplitEmptyFunction: true
- SplitEmptyRecord: true
- SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit: 80
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
- - foreach
- - Q_FOREACH
- - BOOST_FOREACH
-IncludeBlocks: Preserve
-IncludeCategories:
- - Regex: '^<ext/.*\.h>'
- Priority: 2
- - Regex: '^<.*\.h>'
- Priority: 1
- - Regex: '^<.*'
- Priority: 2
- - Regex: '.*'
- Priority: 3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
-RawStringFormats:
- - Language: Cpp
- Delimiters:
- - cc
- - CC
- - cpp
- - Cpp
- - CPP
- - 'c++'
- - 'C++'
- CanonicalDelimiter: ''
- BasedOnStyle: google
- - Language: TextProto
- Delimiters:
- - pb
- - PB
- - proto
- - PROTO
- EnclosingFunctions:
- - EqualsProto
- - EquivToProto
- - PARSE_PARTIAL_TEXT_PROTO
- - PARSE_TEST_PROTO
- - PARSE_TEXT_PROTO
- - ParseTextOrDie
- - ParseTextProtoOrDie
- CanonicalDelimiter: ''
- BasedOnStyle: google
-ReflowComments: true
SortIncludes: false
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: false
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Auto
-TabWidth: 8
-UseTab: Never
-...
-
diff --git a/.cmake-format.py b/.cmake-format.py
index 7b0e4f08d..c79a6ad60 100644
--- a/.cmake-format.py
+++ b/.cmake-format.py
@@ -64,7 +64,7 @@ enable_markup = True
# If comment markup is enabled, don't reflow the first comment block in
# eachlistfile. Use this to preserve formatting of your
# copyright/licensestatements.
-first_comment_is_literal = False
+first_comment_is_literal = True
# If comment markup is enabled, don't reflow any comment block which matchesthis
# (regex) pattern. Default is `None` (disabled).
diff --git a/.mailmap b/.mailmap
index 61adddb51..7ee51a45a 100644
--- a/.mailmap
+++ b/.mailmap
@@ -42,6 +42,7 @@ John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Kyle Siefring <siekyleb@amazon.com>
Kyle Siefring <siekyleb@amazon.com> <kylesiefring@gmail.com>
+Lin Zheng <linzhen@google.com>
Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
Logan Goldberg <logangw@google.com>
Luc Trudeau <luc@trud.ca>
diff --git a/AUTHORS b/AUTHORS
index 0c2da8f26..9668dd39f 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -25,10 +25,13 @@ Angie Chiang <angiebird@google.com>
Aniket Dhok <aniket.dhok@ittiam.com>
Aniket Wanare <Aniket.wanare@ittiam.com>
Ankur Saxena <ankurs@nvidia.com>
+Anupam Pandey <anupam.pandey@ittiam.com>
+Apurve Pandey <apurve.pandey@ittiam.com>
Arild Fuldseth <arilfuld@cisco.com>
Aron Rosenberg <arosenberg@logitech.com>
Arun Singh Negi <arun.negi@ittiam.com>
Attila Nagy <attilanagy@google.com>
+Balaji Anandapadmanaban <balaji.anandapadmanaban@arm.com>
Bohan Li <bohanli@google.com>
Brennan Shacklett <bshacklett@mozilla.com>
Brion Vibber <bvibber@wikimedia.org>
@@ -56,6 +59,7 @@ David Turner <david.turner@argondesign.com>
Deb Mukherjee <debargha@google.com>
Deepa K G <deepa.kg@ittiam.com>
Di Chen <chendixi@google.com>
+Diksha Singh <diksha.singh@ittiam.com>
Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dominic Symes <dominic.symes@arm.com>
@@ -76,6 +80,7 @@ Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Fyodor Kyslov <kyslov@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+Gerda Zsejke More <gerdazsejke.more@arm.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
@@ -96,6 +101,7 @@ Imdad Sardharwalla <imdad.sardharwalla@argondesign.com>
Iole Moccagatta <iole.moccagatta@gmail.com>
Ivan Krasin <krasin@chromium.org>
Ivan Maltz <ivanmaltz@google.com>
+Ivan Rosales <rosalesi@google.com>
Jacek Caban <cjacek@gmail.com>
Jack Haughton <jack.haughton@argondesign.com>
Jacky Chen <jackychen@google.com>
@@ -124,6 +130,7 @@ John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
John Stark <jhnstrk@gmail.com>
Jonathan Matthews <jonathan.matthews@argondesign.com>
+Jonathan Wright <jonathan.wright@arm.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Josh Verdejo <joverdejo@google.com>
@@ -134,6 +141,7 @@ Katsuhisa Yuasa <berupon@gmail.com>
Kavi Ramamurthy <kavii@google.com>
KO Myung-Hun <komh@chollian.net>
Krishna Malladi <kmalladi@google.com>
+Kwanghoon Son <kwangson@yahoo.com>
Kyle Siefring <siekyleb@amazon.com>
Larisa Markeeva <lmarkeeva@google.com>
Lauren Partin <lpartin@google.com>
@@ -143,6 +151,7 @@ Lester Lu <kslu@google.com>
liang zhao <leolzhao@tencent.com>
Linfeng Zhang <linfengz@google.com>
Link.Meng <monthev@gmail.com>
+Lin Zheng <linzhen@google.com>
Logan Goldberg <logangw@google.com>
Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
Lou Quillio <louquillio@google.com>
@@ -176,8 +185,10 @@ Monty Montgomery <cmontgomery@mozilla.com>
Morton Jonuschat <yabawock@gmail.com>
Mudassir Galagnath <mudassir.galaganath@ittiam.com>
Mufaddal Chakera <mufaddal.chakera@ittiam.com>
+Narayan <narayan.kalaburgi@ittiam.com>
Nathan E. Egge <negge@mozilla.com>
Neeraj Gadgil <neeraj.gadgil@ittiam.com>
+Neha Mary Thomas <neha.marythomas@ittiam.com>
Neil Birkbeck <birkbeck@google.com>
Nico Weber <thakis@chromium.org>
Nithya V S <nithya.vs@ittiam.com>
@@ -283,7 +294,9 @@ Yannis Guyon <yguyon@google.com>
Yaowu Xu <yaowu@google.com>
Yeqing Wu <yeqing_wu@apple.com>
Yi Luo <luoyi@google.com>
+Yingying Ma <yingying.ma@intel.com>
Yongzhe Wang <yongzhe@google.com>
+Yuan Tong <tongyuan200097@gmail.com>
Yue Chen <yuec@google.com>
Yunqing Wang <yunqingwang@google.com>
Yury Gitman <yuryg@google.com>
diff --git a/Android.bp b/Android.bp
index cdfe12e1f..db3247573 100644
--- a/Android.bp
+++ b/Android.bp
@@ -92,6 +92,7 @@ aom_av1_common_sources = [
"av1/common/cdef.c",
"av1/common/cdef_block.c",
"av1/common/cfl.c",
+ "av1/common/common_data.c",
"av1/common/convolve.c",
"av1/common/debugmodes.c",
"av1/common/entropy.c",
@@ -138,13 +139,16 @@ aom_av1_encoder_asm_ssse3_x86_64 = [
"av1/encoder/x86/av1_quantize_ssse3_x86_64.asm",
]
+aom_av1_encoder_intrin_arm_crc32 = [
+ "av1/encoder/arm/crc32/hash_crc32.c",
+]
+
aom_av1_encoder_intrin_avx2 = [
"av1/encoder/x86/av1_fwd_txfm2d_avx2.c",
"av1/encoder/x86/av1_highbd_quantize_avx2.c",
"av1/encoder/x86/av1_k_means_avx2.c",
"av1/encoder/x86/av1_quantize_avx2.c",
"av1/encoder/x86/cnn_avx2.c",
- "av1/encoder/x86/corner_match_avx2.c",
"av1/encoder/x86/encodetxb_avx2.c",
"av1/encoder/x86/error_intrin_avx2.c",
"av1/encoder/x86/highbd_block_error_intrin_avx2.c",
@@ -156,12 +160,6 @@ aom_av1_encoder_intrin_avx2 = [
"av1/encoder/x86/wedge_utils_avx2.c",
]
-aom_av1_encoder_intrin_msa = [
- "av1/encoder/mips/msa/error_msa.c",
- "av1/encoder/mips/msa/fdct4x4_msa.c",
- "av1/encoder/mips/msa/temporal_filter_msa.c",
-]
-
aom_av1_encoder_intrin_neon = [
"av1/encoder/arm/neon/av1_error_neon.c",
"av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c",
@@ -173,6 +171,8 @@ aom_av1_encoder_intrin_neon = [
"av1/encoder/arm/neon/picksrt_neon.c",
"av1/encoder/arm/neon/quantize_neon.c",
"av1/encoder/arm/neon/rdopt_neon.c",
+ "av1/encoder/arm/neon/temporal_filter_neon.c",
+ "av1/encoder/arm/neon/wedge_utils_neon.c",
]
aom_av1_encoder_intrin_sse2 = [
@@ -196,7 +196,6 @@ aom_av1_encoder_intrin_sse4_1 = [
"av1/encoder/x86/av1_fwd_txfm1d_sse4.c",
"av1/encoder/x86/av1_fwd_txfm2d_sse4.c",
"av1/encoder/x86/av1_highbd_quantize_sse4.c",
- "av1/encoder/x86/corner_match_sse4.c",
"av1/encoder/x86/encodetxb_sse4.c",
"av1/encoder/x86/highbd_fwd_txfm_sse4.c",
"av1/encoder/x86/pickrst_sse4.c",
@@ -225,8 +224,6 @@ aom_av1_encoder_sources = [
"av1/encoder/cnn.c",
"av1/encoder/compound_type.c",
"av1/encoder/context_tree.c",
- "av1/encoder/corner_detect.c",
- "av1/encoder/corner_match.c",
"av1/encoder/cost.c",
"av1/encoder/dwt.c",
"av1/encoder/encode_strategy.c",
@@ -263,7 +260,6 @@ aom_av1_encoder_sources = [
"av1/encoder/pickcdef.c",
"av1/encoder/picklpf.c",
"av1/encoder/pickrst.c",
- "av1/encoder/ransac.c",
"av1/encoder/ratectrl.c",
"av1/encoder/rd.c",
"av1/encoder/rdopt.c",
@@ -287,10 +283,10 @@ aom_av1_encoder_sources = [
]
aom_av1_rc_qmode_sources = [
- "av1/ducky_encode.cc",
- "av1/ratectrl_qmode.cc",
- "av1/ratectrl_qmode_interface.cc",
- "av1/reference_manager.cc",
+ "av1/qmode_rc/ducky_encode.cc",
+ "av1/qmode_rc/ratectrl_qmode.cc",
+ "av1/qmode_rc/ratectrl_qmode_interface.cc",
+ "av1/qmode_rc/reference_manager.cc",
]
aom_dsp_common_asm_sse2 = [
@@ -319,26 +315,6 @@ aom_dsp_common_intrin_avx2 = [
"aom_dsp/x86/loopfilter_avx2.c",
]
-aom_dsp_common_intrin_dspr2 = [
- "aom_dsp/mips/aom_convolve_copy_dspr2.c",
- "aom_dsp/mips/common_dspr2.c",
- "aom_dsp/mips/convolve2_dspr2.c",
- "aom_dsp/mips/convolve2_horiz_dspr2.c",
- "aom_dsp/mips/convolve2_vert_dspr2.c",
- "aom_dsp/mips/convolve8_horiz_dspr2.c",
- "aom_dsp/mips/convolve8_vert_dspr2.c",
- "aom_dsp/mips/intrapred4_dspr2.c",
- "aom_dsp/mips/intrapred8_dspr2.c",
- "aom_dsp/mips/intrapred16_dspr2.c",
-]
-
-aom_dsp_common_intrin_msa = [
- "aom_dsp/mips/aom_convolve8_horiz_msa.c",
- "aom_dsp/mips/aom_convolve8_vert_msa.c",
- "aom_dsp/mips/aom_convolve_copy_msa.c",
- "aom_dsp/mips/intrapred_msa.c",
-]
-
aom_dsp_common_intrin_neon = [
"aom_dsp/arm/aom_convolve_copy_neon.c",
"aom_dsp/arm/blend_a64_mask_neon.c",
@@ -421,6 +397,7 @@ aom_dsp_encoder_intrin_avx = [
]
aom_dsp_encoder_intrin_avx2 = [
+ "aom_dsp/flow_estimation/x86/corner_match_avx2.c",
"aom_dsp/x86/adaptive_quantize_avx2.c",
"aom_dsp/x86/avg_intrin_avx2.c",
"aom_dsp/x86/blk_sse_sum_avx2.c",
@@ -442,13 +419,6 @@ aom_dsp_encoder_intrin_avx2 = [
"aom_dsp/x86/variance_impl_avx2.c",
]
-aom_dsp_encoder_intrin_msa = [
- "aom_dsp/mips/sad_msa.c",
- "aom_dsp/mips/sub_pixel_variance_msa.c",
- "aom_dsp/mips/subtract_msa.c",
- "aom_dsp/mips/variance_msa.c",
-]
-
aom_dsp_encoder_intrin_neon = [
"aom_dsp/arm/avg_neon.c",
"aom_dsp/arm/hadamard_neon.c",
@@ -477,6 +447,8 @@ aom_dsp_encoder_intrin_sse2 = [
]
aom_dsp_encoder_intrin_sse4_1 = [
+ "aom_dsp/flow_estimation/x86/corner_match_sse4.c",
+ "aom_dsp/x86/avg_intrin_sse4.c",
"aom_dsp/x86/highbd_variance_sse4.c",
"aom_dsp/x86/obmc_sad_sse4.c",
"aom_dsp/x86/obmc_variance_sse4.c",
@@ -499,6 +471,11 @@ aom_dsp_encoder_sources = [
"aom_dsp/bitwriter.c",
"aom_dsp/blk_sse_sum.c",
"aom_dsp/entenc.c",
+ "aom_dsp/flow_estimation/corner_detect.c",
+ "aom_dsp/flow_estimation/corner_match.c",
+ "aom_dsp/flow_estimation/disflow.c",
+ "aom_dsp/flow_estimation/flow_estimation.c",
+ "aom_dsp/flow_estimation/ransac.c",
"aom_dsp/fwd_txfm.c",
"aom_dsp/grain_table.c",
"aom_dsp/noise_model.c",
@@ -540,10 +517,6 @@ aom_rtcd_sources = [
"av1/common/av1_rtcd.c",
]
-aom_scale_intrin_dspr2 = [
- "aom_scale/mips/dspr2/yv12extend_dspr2.c",
-]
-
aom_scale_sources = [
"aom_scale/generic/aom_scale.c",
"aom_scale/generic/gen_scalers.c",
diff --git a/CHANGELOG b/CHANGELOG
index 0df6a6876..b66891a59 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,70 @@
+2023-02-03 v3.6.0
+ This release includes compression efficiency and perceptual quality
+ improvements, speedup and memory optimizations, and some new features.
+ This release is ABI compatible with the last release.
+
+ - New Features
+ * New values 20-27 (corresponding to levels 7.0-7.3 and 8.0-8.3) for
+ the encoder control AV1E_SET_TARGET_SEQ_LEVEL_IDX (note that the
+ proposal to add the new levels are still in draft status). The
+ original special value 24 (keep level stats only for level
+ monitoring) is renumbered as 32.
+ * New encoder control AV1E_SET_SKIP_POSTPROC_FILTERING to skip the
+ application of post-processing filters on reconstructed frame in
+ all intra mode.
+ * New encoder option "kf-max-pyr-height": Maximum height of pyramid
+ structure used for the GOP starting with a key frame (-1 to 5).
+ * Make SVC work for screen content.
+ * Rate control improvements to reduce frame-size spikes for screen
+ content coding.
+ * RISC-V architecture support with gcc toolchain.
+
+ - Compression Efficiency Improvements
+ * Peak compression efficiency in VOD setting is improved by 1%.
+ * 0.7% - 2.2% RTC encoding BDrate gains for real time speed 8 to 10.
+ * 15% RTC encoding BDrate gains for screen content speed 10.
+
+ - Perceptual Quality Improvements
+ * Resolved a visual quality issue that was reported for high
+ resolution clips (2K) for speed 4 and above in VOD use case.
+ * Visual quality improvements to screen content coding.
+ * Quality improvements to temporal layer RTC coding.
+
+ - Speedup and Memory Optimizations
+ * RTC single-thread encoder speedup:
+ o ~6% instruction count reduction for speed 5 and 6.
+ o ~15% instruction count reduction for speed 7.
+ o ~10% instruction count reduction for speed 8 to 10 (>=360p
+ resolutions).
+ * RTC multi-thread encoder speedup (beyond single-thread speedup):
+ o 5-8% encode time reduction for speed 7 to 10.
+ * RTC screen-content encoder speedup:
+ o 11% instruction count reduction for speed 9 and 10 (>=720p
+ resolutions).
+ * ~5% reduction in heap memory requirements for RTC, speed 6 to 10.
+ * AVIF:
+ o 4-5% speedup for speed 9 in still-picture encoding mode.
+ o 3-4% heap memory reduction in still-picture encoding mode for
+ 360p-720p resolutions with multiple threads.
+
+ - Bug Fixes
+ * Added a workaround for an AV1 specification bug which makes
+ TRANSLATION type global motion models unusable.
+ * Fixed AddressSanitizer global-buffer-overflow errors in
+ av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c.
+ * Fixed AddressSanitizer heap-buffer-overflow error in
+ av1_wiener_convolve_add_src_neon().
+ * chromium:1393384 Avoid scene detection on spatial resize.
+ * aomedia:3308 Remove color artifacts under high motion.
+ * aomedia:3310 Avoid out of memory failures with Visual Studio 2017,
+ 2019, and 2022 for Win32 x86 builds.
+ * aomedia:3346 Make SVC work properly for screen content.
+ * aomedia:3348 Fix a bug where an uninitialized search_site is used.
+ * aomedia:3365 Work around what seems like a Visual Studio 2022
+ compiler optimization bug.
+ * aomedia:3369 Incorrect PSNR values reported by libaom for 12-bit
+ encode.
+
2022-08-31 v3.5.0
This release is ABI compatible with the last one, including speedup and memory
optimizations, and new APIs and features.
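
Among the v3.6.0 additions listed above, the new AV1E_SET_SKIP_POSTPROC_FILTERING control only takes effect when the encoder is configured for all-intra usage. The following is a minimal sketch (not part of this change) of how an application might enable it; error handling is abbreviated and the surrounding encode loop is assumed to exist elsewhere:

#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

/* Sketch: set up an all-intra AV1 encoder and ask it to skip the
   post-processing filters on the reconstructed frame, using the
   AV1E_SET_SKIP_POSTPROC_FILTERING control introduced in v3.6.0. */
static aom_codec_err_t init_all_intra_encoder(aom_codec_ctx_t *ctx,
                                              unsigned int width,
                                              unsigned int height) {
  aom_codec_iface_t *iface = aom_codec_av1_cx();
  aom_codec_enc_cfg_t cfg;
  aom_codec_err_t res =
      aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA);
  if (res != AOM_CODEC_OK) return res;

  cfg.g_w = width;
  cfg.g_h = height;

  res = aom_codec_enc_init(ctx, iface, &cfg, 0);
  if (res != AOM_CODEC_OK) return res;

  /* 1 = skip post-processing filters on the reconstruction (default is 0). */
  return aom_codec_control(ctx, AV1E_SET_SKIP_POSTPROC_FILTERING, 1u);
}
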
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0fa444f3..8324401da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,9 +51,9 @@ endif()
# passed to libtool.
#
# We set SO_FILE_VERSION = [c-a].a.r
-set(LT_CURRENT 8)
+set(LT_CURRENT 9)
set(LT_REVISION 0)
-set(LT_AGE 5)
+set(LT_AGE 6)
math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
unset(LT_CURRENT)
@@ -315,15 +315,16 @@ if(CONFIG_AV1_RC_RTC AND CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
endif()
if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
- list(
- APPEND AOM_AV1_RC_QMODE_SOURCES "${AOM_ROOT}/av1/ratectrl_qmode_interface.h"
- "${AOM_ROOT}/av1/ratectrl_qmode_interface.cc"
- "${AOM_ROOT}/av1/reference_manager.h"
- "${AOM_ROOT}/av1/reference_manager.cc"
- "${AOM_ROOT}/av1/ratectrl_qmode.h"
- "${AOM_ROOT}/av1/ratectrl_qmode.cc" "${AOM_ROOT}/av1/ducky_encode.h"
- "${AOM_ROOT}/av1/ducky_encode.cc")
- add_library(av1_rc_qmode OBJECT ${AOM_AV1_RC_QMODE_SOURCES})
+ list(APPEND AOM_AV1_RC_QMODE_SOURCES
+ "${AOM_ROOT}/av1/qmode_rc/ratectrl_qmode_interface.h"
+ "${AOM_ROOT}/av1/qmode_rc/ratectrl_qmode_interface.cc"
+ "${AOM_ROOT}/av1/qmode_rc/reference_manager.h"
+ "${AOM_ROOT}/av1/qmode_rc/reference_manager.cc"
+ "${AOM_ROOT}/av1/qmode_rc/ratectrl_qmode.h"
+ "${AOM_ROOT}/av1/qmode_rc/ratectrl_qmode.cc"
+ "${AOM_ROOT}/av1/qmode_rc/ducky_encode.h"
+ "${AOM_ROOT}/av1/qmode_rc/ducky_encode.cc")
+ add_library(av1_rc_qmode ${AOM_AV1_RC_QMODE_SOURCES})
target_link_libraries(av1_rc_qmode ${AOM_LIB_LINK_TYPE} aom)
if(NOT MSVC AND NOT APPLE)
target_link_libraries(av1_rc_qmode ${AOM_LIB_LINK_TYPE} m)
diff --git a/PATENTS b/PATENTS
index 493f61637..fc4de9edf 100644
--- a/PATENTS
+++ b/PATENTS
@@ -53,7 +53,7 @@ Alliance for Open Media Patent License 1.0
INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
- NOT THE OTHER PARTRY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2. Definitions.
diff --git a/README.android b/README.android
index 56e520764..c849be065 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
Name: libaom
URL: https://aomedia.org
-Version: v3.5.0
+Version: v3.6.0
License: BSD
License File: libaom/LICENSE
-Date: Thursday September 22 2022
-Branch: gladmane
-Commit: bcfe6fbfed315f83ee8a95465c654ee8078dbff9
+Date: Tuesday February 07 2023
+Branch: helia
+Commit: 3c65175b1972da4a1992c1dae2365b48d13f9a8d
Description:
Contains the sources used to compile libaom.
diff --git a/README.md b/README.md
index 0146003db..0d51080d9 100644
--- a/README.md
+++ b/README.md
@@ -19,16 +19,16 @@ README.md {#LREADME}
- [Build with VMAF support](#build-with-vmaf)
2. [Testing the library](#testing-the-av1-codec)
- [Basics](#testing-basics)
- - [Unit tests](#1_unit-tests)
- - [Example tests](#2_example-tests)
- - [Encoder tests](#3_encoder-tests)
+ - [Unit tests](#unit-tests)
+ - [Example tests](#example-tests)
+ - [Encoder tests](#encoder-tests)
- [IDE hosted tests](#ide-hosted-tests)
- [Downloading test data](#downloading-the-test-data)
- [Adding a new test data file](#adding-a-new-test-data-file)
- [Additional test data](#additional-test-data)
- [Sharded testing](#sharded-testing)
- - [Running tests directly](#1_running-test_libaom-directly)
- - [Running tests via CMake](#2_running-the-tests-via-the-cmake-build)
+ - [Running tests directly](#running-test_libaom-directly)
+ - [Running tests via CMake](#running-the-tests-via-the-cmake-build)
3. [Coding style](#coding-style)
4. [Submitting patches](#submitting-patches)
- [Login cookie](#login-cookie)
@@ -165,8 +165,8 @@ The toolchain files available at the time of this writing are:
- armv7-linux-gcc.cmake
- armv7-mingw-gcc.cmake
- armv7s-ios.cmake
- - mips32-linux-gcc.cmake
- - mips64-linux-gcc.cmake
+ - ppc-linux-gcc.cmake
+ - riscv-linux-gcc.cmake
- x86-ios-simulator.cmake
- x86-linux.cmake
- x86-macos.cmake
@@ -341,7 +341,7 @@ There are several methods of testing the AV1 codec. All of these methods require
the presence of the AV1 source code and a working build of the AV1 library and
applications.
-#### 1. Unit tests: {#1_unit-tests}
+#### 1. Unit tests: {#unit-tests}
The unit tests can be run at build time:
@@ -355,7 +355,7 @@ The unit tests can be run at build time:
$ make runtests
~~~
-#### 2. Example tests: {#2_example-tests}
+#### 2. Example tests: {#example-tests}
The example tests require a bash shell and can be run in the following manner:
@@ -370,7 +370,7 @@ The example tests require a bash shell and can be run in the following manner:
$ path/to/aom/test/examples.sh --bin-path examples
~~~
-#### 3. Encoder tests: {#3_encoder-tests}
+#### 3. Encoder tests: {#encoder-tests}
When making a change to the encoder run encoder tests to confirm that your
change has a positive or negligible impact on encode quality. When running these
@@ -487,7 +487,7 @@ https://media.xiph.org/video/derf/
The AV1 codec library unit tests are built upon gtest which supports sharding of
test jobs. Sharded test runs can be achieved in a couple of ways.
-#### 1. Running test\_libaom directly: {#1_running-test_libaom-directly}
+#### 1. Running test\_libaom directly: {#running-test_libaom-directly}
~~~
# Set the environment variable GTEST_TOTAL_SHARDS to control the number of
@@ -501,7 +501,7 @@ test jobs. Sharded test runs can be achieved in a couple of ways.
To create a test shard for each CPU core available on the current system set
`GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one.
-#### 2. Running the tests via the CMake build: {#2_running-the-tests-via-the-cmake-build}
+#### 2. Running the tests via the CMake build: {#running-the-tests-via-the-cmake-build}
~~~
# For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See
diff --git a/README.version b/README.version
index 9bbe9461c..c6dd396d2 100644
--- a/README.version
+++ b/README.version
@@ -1,4 +1,4 @@
-URL: https://aomedia.googlesource.com/aom/+archive/bcfe6fbfed315f83ee8a95465c654ee8078dbff9.tar.gz
-Version: v3.5.0
+URL: https://aomedia.googlesource.com/aom/
+Version: v3.6.0
Local Modifications:
None
diff --git a/aom/aom_codec.h b/aom/aom_codec.h
index 49d48cf15..6a9fb7bb1 100644
--- a/aom/aom_codec.h
+++ b/aom/aom_codec.h
@@ -14,7 +14,7 @@
///////////////////////////////////////////////////////////////////////////////
//
// There are two levels of interfaces used to access the AOM codec: the
-// the aom_codec_iface and the aom_codec_ctx.
+// aom_codec_iface and the aom_codec_ctx.
//
// 1. aom_codec_iface_t
// (Related files: aom/aom_codec.h, aom/src/aom_codec.c,
@@ -244,7 +244,7 @@ typedef int64_t aom_codec_pts_t;
* - aom_codec_get_caps(aom_codec_iface_t *iface): returns
* the capabilities of the codec
* - aom_codec_enc_config_default: generate the default config for
- * initializing the encoder (see documention in aom_encoder.h)
+ * initializing the encoder (see documentation in aom_encoder.h)
* - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context
* structure (see documentation on aom_codec_ctx).
*
@@ -268,18 +268,18 @@ typedef struct aom_codec_priv aom_codec_priv_t;
* support frame types that are codec specific (MPEG-1 D-frames for example)
*/
typedef uint32_t aom_codec_frame_flags_t;
-#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+#define AOM_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */
/*!\brief frame can be dropped without affecting the stream (no future frame
* depends on this one) */
-#define AOM_FRAME_IS_DROPPABLE 0x2
+#define AOM_FRAME_IS_DROPPABLE 0x2u
/*!\brief this is an INTRA_ONLY frame */
-#define AOM_FRAME_IS_INTRAONLY 0x10
+#define AOM_FRAME_IS_INTRAONLY 0x10u
/*!\brief this is an S-frame */
-#define AOM_FRAME_IS_SWITCH 0x20
+#define AOM_FRAME_IS_SWITCH 0x20u
/*!\brief this is an error-resilient frame */
-#define AOM_FRAME_IS_ERROR_RESILIENT 0x40
+#define AOM_FRAME_IS_ERROR_RESILIENT 0x40u
/*!\brief this is a key-frame dependent recovery-point frame */
-#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80
+#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80u
/*!\brief Iterator
*
@@ -365,7 +365,7 @@ int aom_codec_version(void);
*
* Returns a printable string containing the full library version number. This
* may contain additional text following the three digit version number, as to
- * indicate release candidates, prerelease versions, etc.
+ * indicate release candidates, pre-release versions, etc.
*
*/
const char *aom_codec_version_str(void);
@@ -521,7 +521,7 @@ aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name,
#define AOM_CODEC_CONTROL_TYPECHECKED(ctx, id, data) \
aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
-/*!\brief Creates typechecking mechanisms for aom_codec_control
+/*!\brief Creates type checking mechanisms for aom_codec_control
*
* It defines a static function with the correctly typed arguments as a wrapper
* to the type-unsafe aom_codec_control function. It also creates a typedef
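
For context, the type-checked control wrapper described above can be exercised as in this sketch, assuming `ctx` is an initialized encoder context and aom/aomcx.h is included so the per-control wrappers are defined:

/* AOM_CODEC_CONTROL_TYPECHECKED rejects a wrongly typed argument at compile
   time, unlike the variadic aom_codec_control(). AOME_SET_CPUUSED takes int. */
aom_codec_err_t res = AOM_CODEC_CONTROL_TYPECHECKED(&ctx, AOME_SET_CPUUSED, 6);
if (res != AOM_CODEC_OK) {
  /* handle error */
}
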
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index 7009018e3..c0efe79d6 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -76,8 +76,7 @@ extern "C" {
*
* The available flags are specified by AOM_CODEC_USE_* defines.
*/
-#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
-/*!\brief Make the encoder output one partition at a time. */
+#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
/*!\brief Generic fixed size buffer structure
@@ -435,6 +434,11 @@ typedef struct aom_codec_enc_cfg {
/*!\brief Max number of frames to encode
*
+ * If force video mode is off (the default) and g_limit is 1, the encoder
+ * will encode a still picture (still_picture is set to 1 in the sequence
+ * header OBU). If in addition full_still_picture_hdr is 0 (the default),
+ * the encoder will use a reduced header (reduced_still_picture_header is
+ * set to 1 in the sequence header OBU) for the still picture.
*/
unsigned int g_limit;
@@ -817,10 +821,12 @@ typedef struct aom_codec_enc_cfg {
/*!\brief full_still_picture_hdr
*
- * If this is nonzero, the encoder will generate a full header even for
- * still picture encoding. if zero, a reduced header is used for still
- * picture. This flag has no effect when a regular video with more than
- * a single frame is encoded.
+ * If this is nonzero, the encoder will generate a full header
+ * (reduced_still_picture_header is set to 0 in the sequence header OBU) even
+ * for still picture encoding. If this is zero (the default), a reduced
+ * header (reduced_still_picture_header is set to 1 in the sequence header
+ * OBU) is used for still picture encoding. This flag has no effect when a
+ * regular video with more than a single frame is encoded.
*/
unsigned int full_still_picture_hdr;
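
Taken together with the clarified g_limit documentation above, a still picture with the default reduced sequence header is requested simply by limiting the encode to one frame and leaving full_still_picture_hdr at zero. A configuration sketch under those assumptions (aom/aom_encoder.h and aom/aomcx.h included):

/* Sketch: configure single-frame (still picture) output; with g_limit == 1
   and full_still_picture_hdr == 0 the encoder writes still_picture = 1 and
   reduced_still_picture_header = 1 in the sequence header OBU. */
static aom_codec_err_t config_still_picture(aom_codec_enc_cfg_t *cfg,
                                            unsigned int width,
                                            unsigned int height) {
  aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), cfg,
                                                     AOM_USAGE_ALL_INTRA);
  if (res != AOM_CODEC_OK) return res;
  cfg->g_w = width;
  cfg->g_h = height;
  cfg->g_limit = 1;                /* encode exactly one frame */
  cfg->full_still_picture_hdr = 0; /* default: reduced header */
  return AOM_CODEC_OK;
}
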
diff --git a/aom/aom_external_partition.h b/aom/aom_external_partition.h
index 55c59a574..c381f6e5e 100644
--- a/aom/aom_external_partition.h
+++ b/aom/aom_external_partition.h
@@ -246,7 +246,7 @@ typedef struct aom_partition_features {
int block_size; ///< As "BLOCK_SIZE" in av1/common/enums.h
/*!
* Valid partition types. A bitmask is used. "1" represents the
- * corresponding type is vaild. The bitmask follows the enum order for
+ * corresponding type is valid. The bitmask follows the enum order for
* PARTITION_TYPE in "enums.h" to represent one partition type at a bit.
* For example, 0x01 stands for only PARTITION_NONE is valid,
* 0x09 (00...001001) stands for PARTITION_NONE and PARTITION_SPLIT are valid.
@@ -313,10 +313,10 @@ typedef struct aom_partition_decision {
int do_rectangular_split; ///< Try rectangular split partition
int do_square_split; ///< Try square split partition
int prune_rect_part[2]; ///< Prune rectangular partition
- int horza_partition_allowed; ///< Allow HORZ_A partitioin
- int horzb_partition_allowed; ///< Allow HORZ_B partitioin
- int verta_partition_allowed; ///< Allow VERT_A partitioin
- int vertb_partition_allowed; ///< Allow VERT_B partitioin
+ int horza_partition_allowed; ///< Allow HORZ_A partition
+ int horzb_partition_allowed; ///< Allow HORZ_B partition
+ int verta_partition_allowed; ///< Allow VERT_A partition
+ int vertb_partition_allowed; ///< Allow VERT_B partition
int partition_horz4_allowed; ///< Allow HORZ4 partition
int partition_vert4_allowed; ///< Allow VERT4 partition
} aom_partition_decision_t;
@@ -326,7 +326,7 @@ typedef struct aom_partition_decision {
* The encoding stats collected by encoding the superblock with the
* given partition types.
* The encoder sends the stats to the external model for training
- * or inference though "func()" defined in ....
+ * or inference through "func()" defined in ....
*/
typedef struct aom_partition_stats {
int rate; ///< Rate cost of the block
diff --git a/aom/aomcx.h b/aom/aomcx.h
index 8f129369b..b7d1d9d4b 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -191,12 +191,12 @@ enum aome_enc_control_id {
/* NOTE: enum 10 unused */
- /*!\brief Codec control function to set encoder scaling mode,
- * aom_scaling_mode_t* parameter.
+ /*!\brief Codec control function to set encoder scaling mode for the next
+ * frame to be coded, aom_scaling_mode_t* parameter.
*/
AOME_SET_SCALEMODE = 11,
- /*!\brief Codec control function to set encoder spatial layer id, unsigned int
+ /*!\brief Codec control function to set encoder spatial layer id, int
* parameter.
*/
AOME_SET_SPATIAL_LAYER_ID = 12,
@@ -432,7 +432,7 @@ enum aome_enc_control_id {
/*!\brief Codec control function to enable error_resilient_mode, int parameter
*
- * AV1 has a bitstream feature to guarantee parseability of a frame
+ * AV1 has a bitstream feature to guarantee parsability of a frame
* by turning on the error_resilient_decoding mode, even though the
* reference buffers are unreliable or not received.
*
@@ -616,18 +616,19 @@ enum aome_enc_control_id {
* point (OP), int parameter
* Possible values are in the form of "ABxy".
* - AB: OP index.
- * - xy: Target level index for the OP. Can be values 0~23 (corresponding to
- * level 2.0 ~ 7.3, note levels 2.2, 2.3, 3.2, 3.3, 4.2, 4.3, 7.0, 7.1, 7.2
- * & 7.3 are undefined) or 24 (keep level stats only for level monitoring)
- * or 31 (maximum level parameter, no level-based constraints).
+ * - xy: Target level index for the OP. Can be values 0~27 (corresponding to
+ * level 2.0 ~ 8.3, note levels 2.2, 2.3, 3.2, 3.3, 4.2 & 4.3 are
+ * undefined, and that levels 7.x and 8.x are in draft status), 31
+ * (maximum parameters level, no level-based constraints) or 32 (keep
+ * level stats only for level monitoring).
*
* E.g.:
* - "0" means target level index 0 (2.0) for the 0th OP;
* - "109" means target level index 9 (4.1) for the 1st OP;
* - "1019" means target level index 19 (6.3) for the 10th OP.
*
- * If the target level is not specified for an OP, the maximum level parameter
- * of 31 is used as default.
+ * If the target level is not specified for an OP, the maximum parameters
+ * level of 31 is used as default.
*/
AV1E_SET_TARGET_SEQ_LEVEL_IDX = 54,
@@ -1271,7 +1272,7 @@ enum aome_enc_control_id {
*/
AV1E_SET_SVC_LAYER_ID = 131,
- /*!\brief Codec control function to set SVC paramaeters, aom_svc_params_t*
+ /*!\brief Codec control function to set SVC parameters, aom_svc_params_t*
* parameter
*/
AV1E_SET_SVC_PARAMS = 132,
@@ -1455,6 +1456,28 @@ enum aome_enc_control_id {
*/
AV1E_GET_NUM_OPERATING_POINTS = 156,
+ /*!\brief Codec control function to skip the application of post-processing
+ * filters on reconstructed frame, unsigned int parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ *
+ * \attention For this value to be used aom_codec_enc_cfg_t::g_usage
+ * must be set to AOM_USAGE_ALL_INTRA.
+ */
+ AV1E_SET_SKIP_POSTPROC_FILTERING = 157,
+
+ /*!\brief Codec control function to enable the superblock level
+ * qp sweep in AV1 to ensure that end-to-end test runs well,
+ * unsigned int parameter.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ *
+ * \note This is only used in sb_qp_sweep unit test.
+ */
+ AV1E_ENABLE_SB_QP_SWEEP = 158,
+
// Any new encoder control IDs should be added above.
// Maximum allowed encoder control ID is 229.
// No encoder control ID should be added below.
@@ -1637,7 +1660,7 @@ AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
#define AOM_CTRL_AOME_SET_SCALEMODE
-AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, unsigned int)
+AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, int)
#define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID
AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
@@ -2071,6 +2094,12 @@ AOM_CTRL_USE_TYPE(AV1E_GET_TARGET_SEQ_LEVEL_IDX, int *)
AOM_CTRL_USE_TYPE(AV1E_GET_NUM_OPERATING_POINTS, int *)
#define AOM_CTRL_AV1E_GET_NUM_OPERATING_POINTS
+AOM_CTRL_USE_TYPE(AV1E_SET_SKIP_POSTPROC_FILTERING, unsigned int)
+#define AOM_CTRL_AV1E_SET_SKIP_POSTPROC_FILTERING
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_QP_SWEEP, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_SB_QP_SWEEP
+
/*!\endcond */
/*! @} - end defgroup aom_encoder */
#ifdef __cplusplus
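(Editorial usage sketch, not part of the patch: the two controls added above are passed through the existing aom_codec_control() entry point after encoder init; error handling is elided, and AV1E_SET_SKIP_POSTPROC_FILTERING is only honoured when aom_codec_enc_cfg_t::g_usage was configured as AOM_USAGE_ALL_INTRA.)

#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static void enable_new_controls(aom_codec_ctx_t *codec) {
  /* Skip post-processing filters on the reconstructed frame; requires
   * aom_codec_enc_cfg_t::g_usage == AOM_USAGE_ALL_INTRA at init time. */
  aom_codec_control(codec, AV1E_SET_SKIP_POSTPROC_FILTERING, 1u);
  /* Superblock-level QP sweep; intended only for the sb_qp_sweep unit test. */
  aom_codec_control(codec, AV1E_ENABLE_SB_QP_SWEEP, 1u);
}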
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 7e2f57024..c5c2db7b3 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -103,7 +103,12 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h")
+ "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/convolve_2d_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/convolve_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_AVX2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_SSE4_1.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/synonyms.h")
list(APPEND AOM_DSP_COMMON_INTRIN_NEON
"${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
@@ -114,29 +119,6 @@ list(APPEND AOM_DSP_COMMON_INTRIN_NEON
"${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
"${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
-list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
- "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
- "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_MSA
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
- "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
-
if(CONFIG_AV1_HIGHBITDEPTH)
list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
@@ -192,6 +174,22 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/variance.c"
"${AOM_ROOT}/aom_dsp/variance.h")
+ # Flow estimation library
+ if(NOT CONFIG_REALTIME_ONLY)
+ list(APPEND AOM_DSP_ENCODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/flow_estimation/corner_detect.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/corner_match.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/disflow.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/flow_estimation.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/ransac.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c")
+ endif()
+
list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
"${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
"${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
@@ -250,7 +248,9 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
- list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
@@ -263,11 +263,6 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
"${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
- list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
-
if(CONFIG_AV1_HIGHBITDEPTH)
list(APPEND AOM_DSP_ENCODER_ASM_SSE2
"${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
@@ -426,20 +421,6 @@ function(setup_aom_dsp_targets)
endif()
endif()
- if(HAVE_DSPR2)
- add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_DSPR2")
- endif()
-
- if(HAVE_MSA)
- add_intrinsics_object_library("" "msa" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_MSA")
- if(CONFIG_AV1_ENCODER)
- add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
- "AOM_DSP_ENCODER_INTRIN_MSA")
- endif()
- endif()
-
target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
if(BUILD_SHARED_LIBS)
target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 4503c8eb0..b3f8ec7de 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -85,101 +85,101 @@ foreach (@tx_sizes) {
}
}
-specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_top_predictor_4x4 neon sse2/;
specialize qw/aom_dc_top_predictor_4x8 sse2/;
specialize qw/aom_dc_top_predictor_4x16 sse2/;
specialize qw/aom_dc_top_predictor_8x4 sse2/;
-specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon sse2/;
specialize qw/aom_dc_top_predictor_8x16 sse2/;
specialize qw/aom_dc_top_predictor_8x32 sse2/;
specialize qw/aom_dc_top_predictor_16x4 sse2/;
specialize qw/aom_dc_top_predictor_16x8 sse2/;
-specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon sse2/;
specialize qw/aom_dc_top_predictor_16x32 sse2/;
specialize qw/aom_dc_top_predictor_16x64 sse2/;
specialize qw/aom_dc_top_predictor_32x8 sse2/;
specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x32 neon sse2 avx2/;
specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_4x4 neon sse2/;
specialize qw/aom_dc_left_predictor_4x8 sse2/;
specialize qw/aom_dc_left_predictor_4x16 sse2/;
specialize qw/aom_dc_left_predictor_8x4 sse2/;
-specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon sse2/;
specialize qw/aom_dc_left_predictor_8x16 sse2/;
specialize qw/aom_dc_left_predictor_8x32 sse2/;
specialize qw/aom_dc_left_predictor_16x4 sse2/;
specialize qw/aom_dc_left_predictor_16x8 sse2/;
-specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon sse2/;
specialize qw/aom_dc_left_predictor_16x32 sse2/;
specialize qw/aom_dc_left_predictor_16x64 sse2/;
specialize qw/aom_dc_left_predictor_32x8 sse2/;
specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x32 neon sse2 avx2/;
specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_4x4 neon sse2/;
specialize qw/aom_dc_128_predictor_4x8 sse2/;
specialize qw/aom_dc_128_predictor_4x16 sse2/;
specialize qw/aom_dc_128_predictor_8x4 sse2/;
-specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon sse2/;
specialize qw/aom_dc_128_predictor_8x16 sse2/;
specialize qw/aom_dc_128_predictor_8x32 sse2/;
specialize qw/aom_dc_128_predictor_16x4 sse2/;
specialize qw/aom_dc_128_predictor_16x8 sse2/;
-specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon sse2/;
specialize qw/aom_dc_128_predictor_16x32 sse2/;
specialize qw/aom_dc_128_predictor_16x64 sse2/;
specialize qw/aom_dc_128_predictor_32x8 sse2/;
specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x32 neon sse2 avx2/;
specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
-specialize qw/aom_v_predictor_4x4 neon msa sse2/;
+specialize qw/aom_v_predictor_4x4 neon sse2/;
specialize qw/aom_v_predictor_4x8 sse2/;
specialize qw/aom_v_predictor_4x16 sse2/;
specialize qw/aom_v_predictor_8x4 sse2/;
-specialize qw/aom_v_predictor_8x8 neon msa sse2/;
+specialize qw/aom_v_predictor_8x8 neon sse2/;
specialize qw/aom_v_predictor_8x16 sse2/;
specialize qw/aom_v_predictor_8x32 sse2/;
specialize qw/aom_v_predictor_16x4 sse2/;
specialize qw/aom_v_predictor_16x8 sse2/;
-specialize qw/aom_v_predictor_16x16 neon msa sse2/;
+specialize qw/aom_v_predictor_16x16 neon sse2/;
specialize qw/aom_v_predictor_16x32 sse2/;
specialize qw/aom_v_predictor_16x64 sse2/;
specialize qw/aom_v_predictor_32x8 sse2/;
specialize qw/aom_v_predictor_32x16 sse2 avx2/;
-specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_v_predictor_32x32 neon sse2 avx2/;
specialize qw/aom_v_predictor_32x64 sse2 avx2/;
specialize qw/aom_v_predictor_64x16 sse2 avx2/;
specialize qw/aom_v_predictor_64x32 sse2 avx2/;
specialize qw/aom_v_predictor_64x64 sse2 avx2/;
-specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
+specialize qw/aom_h_predictor_4x4 neon sse2/;
specialize qw/aom_h_predictor_4x8 sse2/;
specialize qw/aom_h_predictor_4x16 sse2/;
specialize qw/aom_h_predictor_8x4 sse2/;
-specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
+specialize qw/aom_h_predictor_8x8 neon sse2/;
specialize qw/aom_h_predictor_8x16 sse2/;
specialize qw/aom_h_predictor_8x32 sse2/;
specialize qw/aom_h_predictor_16x4 sse2/;
specialize qw/aom_h_predictor_16x8 sse2/;
-specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
+specialize qw/aom_h_predictor_16x16 neon sse2/;
specialize qw/aom_h_predictor_16x32 sse2/;
specialize qw/aom_h_predictor_16x64 sse2/;
specialize qw/aom_h_predictor_32x8 sse2/;
specialize qw/aom_h_predictor_32x16 sse2/;
-specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_h_predictor_32x32 neon sse2 avx2/;
specialize qw/aom_h_predictor_32x64 sse2/;
specialize qw/aom_h_predictor_64x16 sse2/;
specialize qw/aom_h_predictor_64x32 sse2/;
@@ -267,21 +267,21 @@ specialize qw/aom_smooth_h_predictor_64x64 neon ssse3/;
# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
# by multiply and shift.
-specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
+specialize qw/aom_dc_predictor_4x4 neon sse2/;
specialize qw/aom_dc_predictor_4x8 sse2/;
specialize qw/aom_dc_predictor_4x16 sse2/;
specialize qw/aom_dc_predictor_8x4 sse2/;
-specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
+specialize qw/aom_dc_predictor_8x8 neon sse2/;
specialize qw/aom_dc_predictor_8x16 sse2/;
specialize qw/aom_dc_predictor_8x32 sse2/;
specialize qw/aom_dc_predictor_16x4 sse2/;
specialize qw/aom_dc_predictor_16x8 sse2/;
-specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
+specialize qw/aom_dc_predictor_16x16 neon sse2/;
specialize qw/aom_dc_predictor_16x32 sse2/;
specialize qw/aom_dc_predictor_16x64 sse2/;
specialize qw/aom_dc_predictor_32x8 sse2/;
specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/;
specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
@@ -450,7 +450,7 @@ add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t
add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve_copy neon dspr2 msa sse2 avx2/;
+specialize qw/aom_convolve_copy neon sse2 avx2/;
specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
@@ -478,7 +478,7 @@ add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8
specialize qw/aom_lpf_vertical_14_dual sse2 neon/;
add_proto qw/void aom_lpf_vertical_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
-specialize qw/aom_lpf_vertical_14_quad sse2 neon/;
+specialize qw/aom_lpf_vertical_14_quad avx2 sse2 neon/;
add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_6 sse2 neon/;
@@ -719,7 +719,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# Block subtraction
#
add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
- specialize qw/aom_subtract_block neon msa sse2 avx2/;
+ specialize qw/aom_subtract_block neon sse2 avx2/;
add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
specialize qw/aom_sse sse4_1 avx2 neon/;
@@ -764,30 +764,30 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
}
add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
- specialize qw/aom_sum_sse_2d_i16 sse2 avx2/;
- specialize qw/aom_sad128x128 avx2 neon sse2/;
- specialize qw/aom_sad128x64 avx2 sse2/;
- specialize qw/aom_sad64x128 avx2 sse2/;
- specialize qw/aom_sad64x64 avx2 neon msa sse2/;
- specialize qw/aom_sad64x32 avx2 msa sse2/;
- specialize qw/aom_sad32x64 avx2 msa sse2/;
- specialize qw/aom_sad32x32 avx2 neon msa sse2/;
- specialize qw/aom_sad32x16 avx2 msa sse2/;
- specialize qw/aom_sad16x32 msa sse2/;
- specialize qw/aom_sad16x16 neon msa sse2/;
- specialize qw/aom_sad16x8 neon msa sse2/;
- specialize qw/aom_sad8x16 neon msa sse2/;
- specialize qw/aom_sad8x8 neon msa sse2/;
- specialize qw/aom_sad8x4 msa sse2/;
- specialize qw/aom_sad4x8 msa sse2/;
- specialize qw/aom_sad4x4 neon msa sse2/;
-
- specialize qw/aom_sad4x16 sse2/;
- specialize qw/aom_sad16x4 sse2/;
- specialize qw/aom_sad8x32 sse2/;
- specialize qw/aom_sad32x8 sse2/;
- specialize qw/aom_sad16x64 sse2/;
- specialize qw/aom_sad64x16 sse2/;
+ specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2/;
+ specialize qw/aom_sad128x128 avx2 neon sse2/;
+ specialize qw/aom_sad128x64 avx2 neon sse2/;
+ specialize qw/aom_sad64x128 avx2 neon sse2/;
+ specialize qw/aom_sad64x64 avx2 neon sse2/;
+ specialize qw/aom_sad64x32 avx2 neon sse2/;
+ specialize qw/aom_sad32x64 avx2 neon sse2/;
+ specialize qw/aom_sad32x32 avx2 neon sse2/;
+ specialize qw/aom_sad32x16 avx2 neon sse2/;
+ specialize qw/aom_sad16x32 neon sse2/;
+ specialize qw/aom_sad16x16 neon sse2/;
+ specialize qw/aom_sad16x8 neon sse2/;
+ specialize qw/aom_sad8x16 neon sse2/;
+ specialize qw/aom_sad8x8 neon sse2/;
+ specialize qw/aom_sad8x4 neon sse2/;
+ specialize qw/aom_sad4x8 neon sse2/;
+ specialize qw/aom_sad4x4 neon sse2/;
+
+ specialize qw/aom_sad4x16 neon sse2/;
+ specialize qw/aom_sad16x4 neon sse2/;
+ specialize qw/aom_sad8x32 neon sse2/;
+ specialize qw/aom_sad32x8 neon sse2/;
+ specialize qw/aom_sad16x64 neon sse2/;
+ specialize qw/aom_sad64x16 neon sse2/;
specialize qw/aom_sad_skip_128x128 avx2 sse2 neon/;
specialize qw/aom_sad_skip_128x64 avx2 sse2 neon/;
@@ -810,29 +810,29 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sad_skip_16x64 sse2 neon/;
specialize qw/aom_sad_skip_64x16 sse2 neon/;
- specialize qw/aom_sad128x128_avg avx2 sse2/;
- specialize qw/aom_sad128x64_avg avx2 sse2/;
- specialize qw/aom_sad64x128_avg avx2 sse2/;
- specialize qw/aom_sad64x64_avg avx2 msa sse2/;
- specialize qw/aom_sad64x32_avg avx2 msa sse2/;
- specialize qw/aom_sad32x64_avg avx2 msa sse2/;
- specialize qw/aom_sad32x32_avg avx2 msa sse2/;
- specialize qw/aom_sad32x16_avg avx2 msa sse2/;
- specialize qw/aom_sad16x32_avg msa sse2/;
- specialize qw/aom_sad16x16_avg msa sse2/;
- specialize qw/aom_sad16x8_avg msa sse2/;
- specialize qw/aom_sad8x16_avg msa sse2/;
- specialize qw/aom_sad8x8_avg msa sse2/;
- specialize qw/aom_sad8x4_avg msa sse2/;
- specialize qw/aom_sad4x8_avg msa sse2/;
- specialize qw/aom_sad4x4_avg msa sse2/;
-
- specialize qw/aom_sad4x16_avg sse2/;
- specialize qw/aom_sad16x4_avg sse2/;
- specialize qw/aom_sad8x32_avg sse2/;
- specialize qw/aom_sad32x8_avg sse2/;
- specialize qw/aom_sad16x64_avg sse2/;
- specialize qw/aom_sad64x16_avg sse2/;
+ specialize qw/aom_sad128x128_avg avx2 sse2 neon/;
+ specialize qw/aom_sad128x64_avg avx2 sse2 neon/;
+ specialize qw/aom_sad64x128_avg avx2 sse2 neon/;
+ specialize qw/aom_sad64x64_avg avx2 sse2 neon/;
+ specialize qw/aom_sad64x32_avg avx2 sse2 neon/;
+ specialize qw/aom_sad32x64_avg avx2 sse2 neon/;
+ specialize qw/aom_sad32x32_avg avx2 sse2 neon/;
+ specialize qw/aom_sad32x16_avg avx2 sse2 neon/;
+ specialize qw/aom_sad16x32_avg sse2 neon/;
+ specialize qw/aom_sad16x16_avg sse2 neon/;
+ specialize qw/aom_sad16x8_avg sse2 neon/;
+ specialize qw/aom_sad8x16_avg sse2 neon/;
+ specialize qw/aom_sad8x8_avg sse2 neon/;
+ specialize qw/aom_sad8x4_avg sse2 neon/;
+ specialize qw/aom_sad4x8_avg sse2 neon/;
+ specialize qw/aom_sad4x4_avg sse2 neon/;
+
+ specialize qw/aom_sad4x16_avg sse2 neon/;
+ specialize qw/aom_sad16x4_avg sse2 neon/;
+ specialize qw/aom_sad8x32_avg sse2 neon/;
+ specialize qw/aom_sad32x8_avg sse2 neon/;
+ specialize qw/aom_sad16x64_avg sse2 neon/;
+ specialize qw/aom_sad64x16_avg sse2 neon/;
specialize qw/aom_dist_wtd_sad128x128_avg ssse3/;
specialize qw/aom_dist_wtd_sad128x64_avg ssse3/;
@@ -997,39 +997,37 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]";
add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
}
- specialize qw/aom_sad128x128x4d avx2 sse2/;
- specialize qw/aom_sad128x64x4d avx2 sse2/;
- specialize qw/aom_sad64x128x4d avx2 sse2/;
- specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
- specialize qw/aom_sad64x32x4d avx2 msa sse2/;
- specialize qw/aom_sad64x16x4d avx2 sse2/;
- specialize qw/aom_sad32x64x4d avx2 msa sse2/;
- specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
- specialize qw/aom_sad32x16x4d avx2 msa sse2/;
- specialize qw/aom_sad32x8x4d avx2 sse2/;
- specialize qw/aom_sad16x64x4d sse2/;
- specialize qw/aom_sad16x32x4d msa sse2/;
- specialize qw/aom_sad16x16x4d neon msa sse2/;
- specialize qw/aom_sad16x8x4d msa sse2/;
-
- specialize qw/aom_sad8x16x4d msa sse2/;
- specialize qw/aom_sad8x8x4d msa sse2/;
- specialize qw/aom_sad8x4x4d msa sse2/;
- specialize qw/aom_sad4x16x4d msa sse2/;
- specialize qw/aom_sad4x8x4d msa sse2/;
- specialize qw/aom_sad4x4x4d msa sse2/;
-
- specialize qw/aom_sad4x32x4d sse2/;
- specialize qw/aom_sad4x16x4d sse2/;
- specialize qw/aom_sad16x4x4d sse2/;
- specialize qw/aom_sad8x32x4d sse2/;
- specialize qw/aom_sad32x8x4d sse2/;
- specialize qw/aom_sad64x16x4d sse2/;
+ specialize qw/aom_sad128x128x4d avx2 neon sse2/;
+ specialize qw/aom_sad128x64x4d avx2 neon sse2/;
+ specialize qw/aom_sad64x128x4d avx2 neon sse2/;
+ specialize qw/aom_sad64x64x4d avx2 neon sse2/;
+ specialize qw/aom_sad64x32x4d avx2 neon sse2/;
+ specialize qw/aom_sad32x64x4d avx2 neon sse2/;
+ specialize qw/aom_sad32x32x4d avx2 neon sse2/;
+ specialize qw/aom_sad32x16x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x32x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x16x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x8x4d avx2 neon sse2/;
+
+ specialize qw/aom_sad8x16x4d neon sse2/;
+ specialize qw/aom_sad8x8x4d neon sse2/;
+ specialize qw/aom_sad8x4x4d neon sse2/;
+ specialize qw/aom_sad4x32x4d neon sse2/;
+ specialize qw/aom_sad4x8x4d neon sse2/;
+ specialize qw/aom_sad4x4x4d neon sse2/;
+
+ specialize qw/aom_sad64x16x4d avx2 neon sse2/;
+ specialize qw/aom_sad32x8x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x64x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x4x4d avx2 neon sse2/;
+ specialize qw/aom_sad8x32x4d neon sse2/;
+ specialize qw/aom_sad4x16x4d neon sse2/;
specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon/;
@@ -1042,19 +1040,32 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon/;
specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x64x4d sse2 neon/;
- specialize qw/aom_sad_skip_16x32x4d sse2 neon/;
- specialize qw/aom_sad_skip_16x16x4d sse2 neon/;
- specialize qw/aom_sad_skip_16x8x4d sse2 neon/;
+ specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
specialize qw/aom_sad_skip_8x16x4d sse2 neon/;
specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
- specialize qw/aom_sad_skip_4x16x4d sse2 neon/;
- specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
specialize qw/aom_sad_skip_4x32x4d sse2 neon/;
specialize qw/aom_sad_skip_4x16x4d sse2 neon/;
- specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
- specialize qw/aom_sad_skip_32x8x4d sse2 neon/;
- specialize qw/aom_sad_skip_64x16x4d sse2 neon/;
+ specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
+
+ specialize qw/aom_sad128x128x3d avx2/;
+ specialize qw/aom_sad128x64x3d avx2/;
+ specialize qw/aom_sad64x128x3d avx2/;
+ specialize qw/aom_sad64x64x3d avx2/;
+ specialize qw/aom_sad64x32x3d avx2/;
+ specialize qw/aom_sad32x64x3d avx2/;
+ specialize qw/aom_sad32x32x3d avx2/;
+ specialize qw/aom_sad32x16x3d avx2/;
+ specialize qw/aom_sad16x32x3d avx2/;
+ specialize qw/aom_sad16x16x3d avx2/;
+ specialize qw/aom_sad16x8x3d avx2/;
+
+ specialize qw/aom_sad64x16x3d avx2/;
+ specialize qw/aom_sad32x8x3d avx2/;
+ specialize qw/aom_sad16x64x3d avx2/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
specialize qw/aom_sad128x128x4d_avg sse2/;
@@ -1122,6 +1133,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
if ($w != 128 && $h != 128) {
specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
@@ -1171,6 +1183,23 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_highbd_sad_skip_32x8x4d avx2 sse2/;
specialize qw/aom_highbd_sad_skip_16x64x4d avx2 sse2/;
specialize qw/aom_highbd_sad_skip_64x16x4d avx2 sse2/;
+
+ specialize qw/aom_highbd_sad128x128x3d avx2/;
+ specialize qw/aom_highbd_sad128x64x3d avx2/;
+ specialize qw/aom_highbd_sad64x128x3d avx2/;
+ specialize qw/aom_highbd_sad64x64x3d avx2/;
+ specialize qw/aom_highbd_sad64x32x3d avx2/;
+ specialize qw/aom_highbd_sad32x64x3d avx2/;
+ specialize qw/aom_highbd_sad32x32x3d avx2/;
+ specialize qw/aom_highbd_sad32x16x3d avx2/;
+ specialize qw/aom_highbd_sad16x32x3d avx2/;
+ specialize qw/aom_highbd_sad16x16x3d avx2/;
+ specialize qw/aom_highbd_sad16x8x3d avx2/;
+
+ specialize qw/aom_highbd_sad16x4x3d avx2/;
+ specialize qw/aom_highbd_sad32x8x3d avx2/;
+ specialize qw/aom_highbd_sad16x64x3d avx2/;
+ specialize qw/aom_highbd_sad64x16x3d avx2/;
}
#
# Avg
@@ -1194,14 +1223,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
}
- add_proto qw/void aom_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
- specialize qw/aom_int_pro_row sse2 neon/;
+ add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize qw/aom_int_pro_row avx2 sse2 neon/;
- add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
- specialize qw/aom_int_pro_col sse2 neon/;
+ add_proto qw/void aom_int_pro_col/, "int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize qw/aom_int_pro_col avx2 sse2 neon/;
- add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
- specialize qw/aom_vector_var neon/;
+ add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl";
+ specialize qw/aom_vector_var avx2 sse4_1 neon/;
# TODO(kyslov@) bring back SSE2 by extending it to 128 block size
#specialize qw/aom_vector_var neon sse2/;
@@ -1273,21 +1302,24 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/aom_get16x16var neon msa/;
- specialize qw/aom_get8x8var sse2 neon msa/;
+ specialize qw/aom_get16x16var neon/;
+ specialize qw/aom_get8x8var sse2 neon/;
+
+ add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8";
+ specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon/;
- add_proto qw/void aom_get_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/aom_get_sse_sum_8x8_quad avx2 sse2 neon/;
+ add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
+ specialize qw/aom_get_var_sse_sum_16x16_dual avx2/;
add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_mse16x16 sse2 avx2 neon msa/;
- specialize qw/aom_mse16x8 sse2 msa/;
- specialize qw/aom_mse8x16 sse2 msa/;
- specialize qw/aom_mse8x8 sse2 msa/;
+ specialize qw/aom_mse16x16 sse2 avx2 neon/;
+ specialize qw/aom_mse16x8 sse2 neon/;
+ specialize qw/aom_mse8x16 sse2 neon/;
+ specialize qw/aom_mse8x8 sse2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
foreach $bd (8, 10, 12) {
@@ -1310,8 +1342,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
- specialize qw/aom_get_mb_ss sse2 msa/;
- specialize qw/aom_get4x4sse_cs neon msa/;
+ specialize qw/aom_get_mb_ss sse2/;
+ specialize qw/aom_get4x4sse_cs neon/;
#
# Variance / Subpixel Variance / Subpixel Avg Variance
@@ -1323,7 +1355,10 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
- specialize qw/aom_mse_wxh_16bit sse2 avx2/;
+ specialize qw/aom_mse_wxh_16bit sse2 avx2 neon/;
+
+ add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h";
+ specialize qw/aom_mse_16xh_16bit sse2 avx2/;
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
@@ -1332,64 +1367,64 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
}
- specialize qw/aom_variance128x128 sse2 avx2 neon /;
- specialize qw/aom_variance128x64 sse2 avx2 neon /;
- specialize qw/aom_variance64x128 sse2 avx2 neon /;
- specialize qw/aom_variance64x64 sse2 avx2 neon msa/;
- specialize qw/aom_variance64x32 sse2 avx2 neon msa/;
- specialize qw/aom_variance32x64 sse2 avx2 neon msa/;
- specialize qw/aom_variance32x32 sse2 avx2 neon msa/;
- specialize qw/aom_variance32x16 sse2 avx2 neon msa/;
- specialize qw/aom_variance16x32 sse2 avx2 neon msa/;
- specialize qw/aom_variance16x16 sse2 avx2 neon msa/;
- specialize qw/aom_variance16x8 sse2 avx2 neon msa/;
- specialize qw/aom_variance8x16 sse2 neon msa/;
- specialize qw/aom_variance8x8 sse2 neon msa/;
- specialize qw/aom_variance8x4 sse2 neon msa/;
- specialize qw/aom_variance4x8 sse2 neon msa/;
- specialize qw/aom_variance4x4 sse2 neon msa/;
-
- specialize qw/aom_sub_pixel_variance128x128 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance128x64 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x128 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x32 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x64 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x16 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x32 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x16 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x8 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x16 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x4 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance4x8 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance4x4 neon msa sse2 ssse3/;
-
- specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+ specialize qw/aom_variance128x128 sse2 avx2 neon/;
+ specialize qw/aom_variance128x64 sse2 avx2 neon/;
+ specialize qw/aom_variance64x128 sse2 avx2 neon/;
+ specialize qw/aom_variance64x64 sse2 avx2 neon/;
+ specialize qw/aom_variance64x32 sse2 avx2 neon/;
+ specialize qw/aom_variance32x64 sse2 avx2 neon/;
+ specialize qw/aom_variance32x32 sse2 avx2 neon/;
+ specialize qw/aom_variance32x16 sse2 avx2 neon/;
+ specialize qw/aom_variance16x32 sse2 avx2 neon/;
+ specialize qw/aom_variance16x16 sse2 avx2 neon/;
+ specialize qw/aom_variance16x8 sse2 avx2 neon/;
+ specialize qw/aom_variance8x16 sse2 neon/;
+ specialize qw/aom_variance8x8 sse2 neon/;
+ specialize qw/aom_variance8x4 sse2 neon/;
+ specialize qw/aom_variance4x8 sse2 neon/;
+ specialize qw/aom_variance4x4 sse2 neon/;
+
+ specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x8 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x4 neon sse2 ssse3/;
+
+ specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- specialize qw/aom_variance4x16 sse2/;
- specialize qw/aom_variance16x4 sse2 avx2/;
- specialize qw/aom_variance8x32 sse2/;
- specialize qw/aom_variance32x8 sse2 avx2/;
- specialize qw/aom_variance16x64 sse2 avx2/;
- specialize qw/aom_variance64x16 sse2 avx2/;
+ specialize qw/aom_variance4x16 neon sse2/;
+ specialize qw/aom_variance16x4 neon sse2 avx2/;
+ specialize qw/aom_variance8x32 neon sse2/;
+ specialize qw/aom_variance32x8 neon sse2 avx2/;
+ specialize qw/aom_variance16x64 neon sse2 avx2/;
+ specialize qw/aom_variance64x16 neon sse2 avx2/;
specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
@@ -1397,12 +1432,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/;
specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 ssse3/;
specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 ssse3/;
@@ -1525,43 +1560,43 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
}
add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x32 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x64 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x16 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x32 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x16 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x8 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x16 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x8 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x4 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x8 sse2 ssse3/;
add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x4 sse2 ssse3/;
#
# Comp Avg
@@ -2000,6 +2035,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
}
+ # Flow estimation library
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
+ specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
+ }
+
} # CONFIG_AV1_ENCODER
1;
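(For orientation: each "specialize" line above only declares which SIMD variants the generated aom_dsp_rtcd.h may dispatch to; the generated header then selects one at runtime from detected CPU features. A simplified, hypothetical sketch of that dispatch follows; the real generated code uses different setup and flag names.)

#include <stdint.h>

unsigned int aom_sad16x16_c(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride);
unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride);

/* Function pointer that callers invoke as aom_sad16x16(...). */
unsigned int (*aom_sad16x16)(const uint8_t *, int, const uint8_t *, int);

static void rtcd_setup_sketch(int cpu_has_neon) {
  aom_sad16x16 = aom_sad16x16_c; /* The C fallback is always available. */
  if (cpu_has_neon) aom_sad16x16 = aom_sad16x16_neon; /* "neon" variant. */
}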
diff --git a/aom_dsp/aom_simd_inline.h b/aom_dsp/aom_simd_inline.h
index eb333f6f6..b4b1b3563 100644
--- a/aom_dsp/aom_simd_inline.h
+++ b/aom_dsp/aom_simd_inline.h
@@ -18,4 +18,7 @@
#define SIMD_INLINE static AOM_FORCE_INLINE
#endif
+#define SIMD_CLAMP(value, min, max) \
+ ((value) > (max) ? (max) : (value) < (min) ? (min) : (value))
+
#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
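(The SIMD_CLAMP addition is a plain saturating clamp; a minimal behavioural sketch, with the macro body repeated so the snippet is self-contained:)

#define SIMD_CLAMP(value, min, max) \
  ((value) > (max) ? (max) : (value) < (min) ? (min) : (value))

static int clamp_examples(void) {
  /* Values outside [min, max] saturate to the nearest bound. */
  return SIMD_CLAMP(300, 0, 255) /* 255 */ + SIMD_CLAMP(-7, 0, 255) /* 0 */;
}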
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 9f5b545f2..991fd3f3b 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -89,58 +89,55 @@ int aom_satd_lp_neon(const int16_t *coeff, int length) {
return horizontal_add_s32x4(accum);
}
-void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref,
- const int ref_stride, const int height) {
- int i;
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
const uint8_t *idx = ref;
- uint16x8_t vec0 = vdupq_n_u16(0);
- uint16x8_t vec1 = vec0;
- uint8x16_t tmp;
-
- for (i = 0; i < height; ++i) {
- tmp = vld1q_u8(idx);
- idx += ref_stride;
- vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
- vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+ const uint16x8_t zero = vdupq_n_u16(0);
+ const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+
+ for (int wd = 0; wd < width; wd += 16) {
+ uint16x8_t vec0 = zero;
+ uint16x8_t vec1 = zero;
+ idx = ref + wd;
+ for (int ht = 0; ht < height; ++ht) {
+ const uint8x16_t tmp = vld1q_u8(idx);
+ idx += ref_stride;
+ vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
+ vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+ }
+
+ const int16x8_t result0 =
+ vshlq_s16(vreinterpretq_s16_u16(vec0), neg_norm_factor);
+ const int16x8_t result1 =
+ vshlq_s16(vreinterpretq_s16_u16(vec1), neg_norm_factor);
+
+ vst1q_s16(hbuf + wd, result0);
+ vst1q_s16(hbuf + wd + 8, result1);
}
-
- if (128 == height) {
- vec0 = vshrq_n_u16(vec0, 6);
- vec1 = vshrq_n_u16(vec1, 6);
- } else if (64 == height) {
- vec0 = vshrq_n_u16(vec0, 5);
- vec1 = vshrq_n_u16(vec1, 5);
- } else if (32 == height) {
- vec0 = vshrq_n_u16(vec0, 4);
- vec1 = vshrq_n_u16(vec1, 4);
- } else if (16 == height) {
- vec0 = vshrq_n_u16(vec0, 3);
- vec1 = vshrq_n_u16(vec1, 3);
- }
-
- vst1q_s16(hbuf, vreinterpretq_s16_u16(vec0));
- hbuf += 8;
- vst1q_s16(hbuf, vreinterpretq_s16_u16(vec1));
}
-int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width) {
- const uint8_t *idx;
- uint16x8_t sum = vdupq_n_u16(0);
-
- for (idx = ref; idx < (ref + width); idx += 16) {
- uint8x16_t vec = vld1q_u8(idx);
- sum = vaddq_u16(sum, vpaddlq_u8(vec));
- }
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ for (int ht = 0; ht < height; ++ht) {
+ uint16x8_t sum = vdupq_n_u16(0);
+ for (int wd = 0; wd < width; wd += 16) {
+ const uint8x16_t vec = vld1q_u8(ref + wd);
+ sum = vaddq_u16(sum, vpaddlq_u8(vec));
+ }
#if defined(__aarch64__)
- return (int16_t)vaddvq_u16(sum);
+ vbuf[ht] = ((int16_t)vaddvq_u16(sum)) >> norm_factor;
#else
- const uint32x4_t a = vpaddlq_u16(sum);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return (int16_t)vget_lane_u32(c, 0);
+ const uint32x4_t a = vpaddlq_u16(sum);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ vbuf[ht] = ((int16_t)vget_lane_u32(c, 0)) >> norm_factor;
#endif
+ ref += ref_stride;
+ }
}
// coeff: 16 bits, dynamic range [-32640, 32640].
@@ -165,7 +162,7 @@ int aom_satd_neon(const tran_low_t *coeff, int length) {
return horizontal_add_s32x4(accum);
}
-int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
int32x4_t v_mean = vdupq_n_s32(0);
int32x4_t v_sse = v_mean;
int16x8_t v_ref, v_src;
@@ -187,10 +184,11 @@ int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
v_sse = vmlal_s16(v_sse, v_high, v_high);
#endif
}
- int mean = horizontal_add_s32x4(v_mean);
- int sse = horizontal_add_s32x4(v_sse);
+ const int mean = horizontal_add_s32x4(v_mean);
+ const int sse = horizontal_add_s32x4(v_sse);
+ const unsigned int mean_abs = mean >= 0 ? mean : -mean;
// (mean * mean): dynamic range 31 bits.
- int var = sse - ((mean * mean) >> (bwl + 2));
+ const int var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
return var;
}
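(For reference, the scalar computation this NEON routine mirrors runs over 4 << bwl samples and, after the change above, squares the magnitude of the mean so the shifted intermediate is unsigned. A sketch only; aom_vector_var_c in the library is the canonical reference.)

#include <stdint.h>

static int vector_var_sketch(const int16_t *ref, const int16_t *src, int bwl) {
  const int width = 4 << bwl;
  int mean = 0, sse = 0;
  for (int i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    mean += diff;
    sse += diff * diff;
  }
  /* Square the absolute mean so the right shift operates on an unsigned
   * quantity, matching the mean_abs change in the NEON version above. */
  const unsigned int mean_abs =
      mean >= 0 ? (unsigned int)mean : (unsigned int)-mean;
  return sse - (int)((mean_abs * mean_abs) >> (bwl + 2));
}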
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 939c9a6f2..75dd7d623 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -154,21 +154,27 @@ void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
+ DECLARE_ALIGNED(32, tran_low_t, temp_coeff[16 * 16]);
/* Rearrange 16x16 to 8x32 and remove stride.
* Top left first. */
- aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
+ temp_coeff + 0);
/* Top right. */
- aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+ aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
+ temp_coeff + 64);
/* Bottom left. */
- aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+ aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
+ temp_coeff + 128);
/* Bottom right. */
- aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+ aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
+ temp_coeff + 192);
+ tran_low_t *t_coeff = temp_coeff;
for (int i = 0; i < 64; i += 8) {
- const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
- const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
- const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
- const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);
+ const int16x8_t a0 = load_tran_low_to_s16q(t_coeff + 0);
+ const int16x8_t a1 = load_tran_low_to_s16q(t_coeff + 64);
+ const int16x8_t a2 = load_tran_low_to_s16q(t_coeff + 128);
+ const int16x8_t a3 = load_tran_low_to_s16q(t_coeff + 192);
const int16x8_t b0 = vhaddq_s16(a0, a1);
const int16x8_t b1 = vhsubq_s16(a0, a1);
@@ -180,11 +186,12 @@ void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
const int16x8_t c2 = vsubq_s16(b0, b2);
const int16x8_t c3 = vsubq_s16(b1, b3);
- store_s16q_to_tran_low(coeff + 0, c0);
- store_s16q_to_tran_low(coeff + 64, c1);
- store_s16q_to_tran_low(coeff + 128, c2);
- store_s16q_to_tran_low(coeff + 192, c3);
+ store_s16q_to_tran_low_offset_4(coeff + 0, c0);
+ store_s16q_to_tran_low_offset_4(coeff + 64, c1);
+ store_s16q_to_tran_low_offset_4(coeff + 128, c2);
+ store_s16q_to_tran_low_offset_4(coeff + 192, c3);
- coeff += 8;
+ t_coeff += 8;
+ coeff += (4 + (((i >> 3) & 1) << 3));
}
}
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 7544777af..fa2f11e4b 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -503,7 +503,7 @@ HIGHBD_SMOOTH_NXM(8, 32)
/* Precompute weighted values that don't vary with |y|. */ \
uint32x4_t weighted_tr_low[(W) >> 3]; \
uint32x4_t weighted_tr_high[(W) >> 3]; \
- for (int i = 0; i<(W)>> 3; ++i) { \
+ for (int i = 0; i < (W) >> 3; ++i) { \
const int x = i << 3; \
const uint16x4_t weights_x_low = \
vld1_u16(smooth_weights_u16 + (W)-4 + x); \
@@ -518,7 +518,7 @@ HIGHBD_SMOOTH_NXM(8, 32)
const uint32x4_t weighted_bl = \
vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
uint16_t *dst_x = dst; \
- for (int i = 0; i<(W)>> 3; ++i) { \
+ for (int i = 0; i < (W) >> 3; ++i) { \
const int x = i << 3; \
const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \
vld1_u16(top_row + x + 4) } }; \
@@ -644,7 +644,7 @@ HIGHBD_SMOOTH_V_NXM(8, 32)
const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
\
uint16x4x2_t top_vals[(W) >> 3]; \
- for (int i = 0; i<(W)>> 3; ++i) { \
+ for (int i = 0; i < (W) >> 3; ++i) { \
const int x = i << 3; \
top_vals[i].val[0] = vld1_u16(top_row + x); \
top_vals[i].val[1] = vld1_u16(top_row + x + 4); \
@@ -656,7 +656,7 @@ HIGHBD_SMOOTH_V_NXM(8, 32)
vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
\
uint16_t *dst_x = dst; \
- for (int i = 0; i<(W)>> 3; ++i) { \
+ for (int i = 0; i < (W) >> 3; ++i) { \
const uint32x4_t weighted_top_low = \
vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \
vst1_u16(dst_x, \
@@ -776,7 +776,7 @@ HIGHBD_SMOOTH_H_NXM(8, 32)
uint16x4_t weights_x_high[(W) >> 3]; \
uint32x4_t weighted_tr_low[(W) >> 3]; \
uint32x4_t weighted_tr_high[(W) >> 3]; \
- for (int i = 0; i<(W)>> 3; ++i) { \
+ for (int i = 0; i < (W) >> 3; ++i) { \
const int x = i << 3; \
weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \
weighted_tr_low[i] = \
@@ -789,7 +789,7 @@ HIGHBD_SMOOTH_H_NXM(8, 32)
for (int y = 0; y < height; ++y) { \
uint16_t *dst_x = dst; \
const uint16_t left_y = left_column[y]; \
- for (int i = 0; i<(W)>> 3; ++i) { \
+ for (int i = 0; i < (W) >> 3; ++i) { \
const uint32x4_t weighted_left_low = \
vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \
vst1_u16(dst_x, \
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 81643e9dc..73a512732 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -15,6 +15,64 @@
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"
+// Support for xN Neon intrinsics is lacking in some compilers.
+#if defined(__arm__) || defined(_M_ARM)
+#define ARM_32_BIT
+#endif
+
+// DEFICIENT_CLANG_32_BIT includes clang-cl.
+#if defined(__clang__) && defined(ARM_32_BIT) && \
+ (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
+#define DEFICIENT_CLANG_32_BIT // This includes clang-cl.
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
+#define GCC_32_BIT
+#endif
+
+#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
+
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+
+#elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit.
+#if __GNUC__ < 8
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+#endif // __GNUC__ < 8
+
+#if __GNUC__ < 9
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+#endif // __GNUC__ < 9
+#endif // defined(__GNUC__) && !defined(__clang__)
+
static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
const uint8x8_t s1) {
vst1_u8(s, s0);
@@ -316,14 +374,25 @@ static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
*s3 = vld1q_s16(s);
}
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+ uint32_t a;
+ memcpy(&a, buf, 4);
+ buf += stride;
+ uint32x2_t a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
uint32_t a;
- uint32x4_t a_u32 = vdupq_n_u32(0);
+ uint32x4_t a_u32;
if (stride == 4) return vld1q_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vsetq_lane_u32(a, a_u32, 0);
+ a_u32 = vdupq_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -331,7 +400,6 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 2);
memcpy(&a, buf, 4);
- buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 3);
return vreinterpretq_u8_u32(a_u32);
}
@@ -343,25 +411,25 @@ static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
memcpy(&a, buf, 4);
buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 0);
+ *tu0 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu0 = vset_lane_u32(a, *tu0, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu1 = vset_lane_u32(a, *tu1, 0);
+ *tu1 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu1 = vset_lane_u32(a, *tu1, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu2 = vset_lane_u32(a, *tu2, 0);
+ *tu2 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu2 = vset_lane_u32(a, *tu2, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu3 = vset_lane_u32(a, *tu3, 0);
+ *tu3 = vdup_n_u32(a);
memcpy(&a, buf, 4);
*tu3 = vset_lane_u32(a, *tu3, 1);
}
@@ -372,13 +440,13 @@ static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
memcpy(&a, buf, 4);
buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 0);
+ *tu0 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu0 = vset_lane_u32(a, *tu0, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu1 = vset_lane_u32(a, *tu1, 0);
+ *tu1 = vdup_n_u32(a);
memcpy(&a, buf, 4);
*tu1 = vset_lane_u32(a, *tu1, 1);
}
@@ -398,9 +466,8 @@ static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
memcpy(&a, buf, 4);
buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 0);
+ *tu0 = vdup_n_u32(a);
memcpy(&a, buf, 4);
- buf += stride;
*tu0 = vset_lane_u32(a, *tu0, 1);
}
@@ -426,9 +493,8 @@ static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
memcpy(&a, buf, 2);
buf += stride;
- *tu0 = vset_lane_u16(a, *tu0, 0);
+ *tu0 = vdup_n_u16(a);
memcpy(&a, buf, 2);
- buf += stride;
*tu0 = vset_lane_u16(a, *tu0, 1);
}
@@ -472,13 +538,13 @@ static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
memcpy(&a, buf, 8);
buf += stride;
- *tu0 = vsetq_lane_u64(a, *tu0, 0);
+ *tu0 = vdupq_n_u64(a);
memcpy(&a, buf, 8);
buf += stride;
*tu0 = vsetq_lane_u64(a, *tu0, 1);
memcpy(&a, buf, 8);
buf += stride;
- *tu1 = vsetq_lane_u64(a, *tu1, 0);
+ *tu1 = vdupq_n_u64(a);
memcpy(&a, buf, 8);
*tu1 = vsetq_lane_u64(a, *tu1, 1);
}
@@ -543,4 +609,17 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
vst1q_s32(buf + 4, v1);
}
+// Similar to store_s16q_to_tran_low(), but stores the second result at an
+// offset of 8 (instead of 4) so that the output matches that of the C
+// implementation. The "offset_4" in the function name signifies that the
+// caller should advance the pointer by at least 4 after each
+// store_s16q_to_tran_low_offset_4() call.
+static INLINE void store_s16q_to_tran_low_offset_4(tran_low_t *buf,
+ const int16x8_t a) {
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 8, v1);
+}
+
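// Calling-pattern sketch for the function above, using a hypothetical helper
// (illustration only). The first call writes buf[0..3] and buf[8..11];
// advancing buf by 4 makes the second call fill buf[4..7] and buf[12..15], so
// two calls cover 16 contiguous outputs.
static INLINE void store_16_outputs(tran_low_t *buf, const int16x8_t a,
                                    const int16x8_t b) {
  store_s16q_to_tran_low_offset_4(buf, a);
  store_s16q_to_tran_low_offset_4(buf + 4, b);
}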
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index b62628e1e..e1eccc356 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -17,550 +17,518 @@
#include "aom/aom_integer.h"
#include "aom_dsp/arm/sum_neon.h"
-// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
-// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
-// and vec_sum_ref_hi.
-static void sad_neon_64(const uint8x16_t vec_src_00,
- const uint8x16_t vec_src_16,
- const uint8x16_t vec_src_32,
- const uint8x16_t vec_src_48, const uint8_t *ref,
- uint16x8_t *vec_sum_ref_lo,
- uint16x8_t *vec_sum_ref_hi) {
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
- const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
- vget_low_u8(vec_ref_32));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
- vget_high_u8(vec_ref_32));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
- vget_low_u8(vec_ref_48));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
- vget_high_u8(vec_ref_48));
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint32x4_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
}
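// Scalar reference for the ABD + UDOT idiom above (hypothetical helper,
// illustration only): dotting the absolute differences with a vector of ones
// sums each group of four bytes straight into a 32-bit lane, so no 16-bit
// widening accumulators are needed.
static INLINE void sad16_scalar_equiv(const uint8_t *src, const uint8_t *ref,
                                      uint32_t acc[4]) {
  for (int lane = 0; lane < 4; ++lane) {
    for (int k = 0; k < 4; ++k) {
      const int d = src[4 * lane + k] - ref[4 * lane + k];
      acc[lane] += (uint32_t)(d < 0 ? -d : d);  // abs diff, as vabdq_u8 does
    }
  }
}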
-// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
-// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
-static void sad_neon_32(const uint8x16_t vec_src_00,
- const uint8x16_t vec_src_16, const uint8_t *ref,
- uint16x8_t *vec_sum_ref_lo,
- uint16x8_t *vec_sum_ref_hi) {
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
+static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ const uint8x16_t s4 = vld1q_u8(src + i * src_stride + 64);
+ sad16_neon(s4, vld1q_u8(ref[0] + i * ref_stride + 64), &sum_lo[0]);
+ sad16_neon(s4, vld1q_u8(ref[1] + i * ref_stride + 64), &sum_lo[1]);
+ sad16_neon(s4, vld1q_u8(ref[2] + i * ref_stride + 64), &sum_lo[2]);
+ sad16_neon(s4, vld1q_u8(ref[3] + i * ref_stride + 64), &sum_lo[3]);
+
+ const uint8x16_t s5 = vld1q_u8(src + i * src_stride + 80);
+ sad16_neon(s5, vld1q_u8(ref[0] + i * ref_stride + 80), &sum_hi[0]);
+ sad16_neon(s5, vld1q_u8(ref[1] + i * ref_stride + 80), &sum_hi[1]);
+ sad16_neon(s5, vld1q_u8(ref[2] + i * ref_stride + 80), &sum_hi[2]);
+ sad16_neon(s5, vld1q_u8(ref[3] + i * ref_stride + 80), &sum_hi[3]);
+
+ const uint8x16_t s6 = vld1q_u8(src + i * src_stride + 96);
+ sad16_neon(s6, vld1q_u8(ref[0] + i * ref_stride + 96), &sum_lo[0]);
+ sad16_neon(s6, vld1q_u8(ref[1] + i * ref_stride + 96), &sum_lo[1]);
+ sad16_neon(s6, vld1q_u8(ref[2] + i * ref_stride + 96), &sum_lo[2]);
+ sad16_neon(s6, vld1q_u8(ref[3] + i * ref_stride + 96), &sum_lo[3]);
+
+ const uint8x16_t s7 = vld1q_u8(src + i * src_stride + 112);
+ sad16_neon(s7, vld1q_u8(ref[0] + i * ref_stride + 112), &sum_hi[0]);
+ sad16_neon(s7, vld1q_u8(ref[1] + i * ref_stride + 112), &sum_hi[1]);
+ sad16_neon(s7, vld1q_u8(ref[2] + i * ref_stride + 112), &sum_hi[2]);
+ sad16_neon(s7, vld1q_u8(ref[3] + i * ref_stride + 112), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ uint32x4_t res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]),
+ vaddq_u32(sum_lo[1], sum_hi[1]));
+ uint32x4_t res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]),
+ vaddq_u32(sum_lo[3], sum_hi[3]));
+ vst1q_u32(res, vpaddq_u32(res0, res1));
}
-void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 64; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
- const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
- &vec_sum_ref0_lo, &vec_sum_ref0_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
- &vec_sum_ref1_lo, &vec_sum_ref1_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
- &vec_sum_ref2_lo, &vec_sum_ref2_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
- &vec_sum_ref3_lo, &vec_sum_ref3_hi);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ uint32x4_t res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]),
+ vaddq_u32(sum_lo[1], sum_hi[1]));
+ uint32x4_t res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]),
+ vaddq_u32(sum_lo[3], sum_hi[3]));
+ vst1q_u32(res, vpaddq_u32(res0, res1));
+}
- res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ uint32x4_t res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]),
+ vaddq_u32(sum_lo[1], sum_hi[1]));
+ uint32x4_t res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]),
+ vaddq_u32(sum_lo[3], sum_hi[3]));
+ vst1q_u32(res, vpaddq_u32(res0, res1));
}
-void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 32; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-
- sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
- &vec_sum_ref0_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
- &vec_sum_ref1_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
- &vec_sum_ref2_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
- &vec_sum_ref3_hi);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src + i * src_stride);
+ sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ uint32x4_t res0 = vpaddq_u32(sum[0], sum[1]);
+ uint32x4_t res1 = vpaddq_u32(sum[2], sum[3]);
+ vst1q_u32(res, vpaddq_u32(res0, res1));
+}
+
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
- res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
}
-void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 16; ++i) {
- const uint8x16_t vec_src = vld1q_u8(src);
- const uint8x16_t vec_ref0 = vld1q_u8(ref0);
- const uint8x16_t vec_ref1 = vld1q_u8(ref1);
- const uint8x16_t vec_ref2 = vld1q_u8(ref2);
- const uint8x16_t vec_ref3 = vld1q_u8(ref3);
-
- vec_sum_ref0_lo =
- vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
- vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref0));
- vec_sum_ref1_lo =
- vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
- vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref1));
- vec_sum_ref2_lo =
- vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
- vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref2));
- vec_sum_ref3_lo =
- vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
- vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref3));
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
+static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ vst1q_u32(res, vdupq_n_u32(0));
+ int h_tmp = h > 32 ? 32 : h;
+
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ const uint8x16_t s4 = vld1q_u8(src + i * src_stride + 64);
+ sad16_neon(s4, vld1q_u8(ref[0] + i * ref_stride + 64), &sum_lo[0]);
+ sad16_neon(s4, vld1q_u8(ref[1] + i * ref_stride + 64), &sum_lo[1]);
+ sad16_neon(s4, vld1q_u8(ref[2] + i * ref_stride + 64), &sum_lo[2]);
+ sad16_neon(s4, vld1q_u8(ref[3] + i * ref_stride + 64), &sum_lo[3]);
+
+ const uint8x16_t s5 = vld1q_u8(src + i * src_stride + 80);
+ sad16_neon(s5, vld1q_u8(ref[0] + i * ref_stride + 80), &sum_hi[0]);
+ sad16_neon(s5, vld1q_u8(ref[1] + i * ref_stride + 80), &sum_hi[1]);
+ sad16_neon(s5, vld1q_u8(ref[2] + i * ref_stride + 80), &sum_hi[2]);
+ sad16_neon(s5, vld1q_u8(ref[3] + i * ref_stride + 80), &sum_hi[3]);
+
+ const uint8x16_t s6 = vld1q_u8(src + i * src_stride + 96);
+ sad16_neon(s6, vld1q_u8(ref[0] + i * ref_stride + 96), &sum_lo[0]);
+ sad16_neon(s6, vld1q_u8(ref[1] + i * ref_stride + 96), &sum_lo[1]);
+ sad16_neon(s6, vld1q_u8(ref[2] + i * ref_stride + 96), &sum_lo[2]);
+ sad16_neon(s6, vld1q_u8(ref[3] + i * ref_stride + 96), &sum_lo[3]);
+
+ const uint8x16_t s7 = vld1q_u8(src + i * src_stride + 112);
+ sad16_neon(s7, vld1q_u8(ref[0] + i * ref_stride + 112), &sum_hi[0]);
+ sad16_neon(s7, vld1q_u8(ref[1] + i * ref_stride + 112), &sum_hi[1]);
+ sad16_neon(s7, vld1q_u8(ref[2] + i * ref_stride + 112), &sum_hi[2]);
+ sad16_neon(s7, vld1q_u8(ref[3] + i * ref_stride + 112), &sum_hi[3]);
+
+ i++;
+ } while (i < h_tmp);
+
+ res[0] += horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
+ res[1] += horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
+ res[2] += horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
+ res[3] += horizontal_long_add_u16x8(sum_lo[3], sum_hi[3]);
+
+ h_tmp += 32;
+ } while (i < h);
+}
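// Worked bound for the 32-row inner block above (illustrative arithmetic):
// each row feeds four pairwise-accumulated 16-byte chunks into sum_lo and
// four into sum_hi, so a 16-bit lane grows by at most 4 * 2 * 255 = 2040 per
// row. Over 32 rows that is 65280, which still fits in a uint16_t before the
// partial sums are widened into res[]. The 64-wide variant below uses 64-row
// blocks for the same reason: 2 * 2 * 255 * 64 = 65280.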
- res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ vst1q_u32(res, vdupq_n_u32(0));
+ int h_tmp = h > 64 ? 64 : h;
+
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ i++;
+ } while (i < h_tmp);
+
+ res[0] += horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
+ res[1] += horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
+ res[2] += horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
+ res[3] += horizontal_long_add_u16x8(sum_lo[3], sum_hi[3]);
+
+ h_tmp += 64;
+ } while (i < h);
}
-static void sad_row4_neon(uint16x4_t *vec_src, const uint8x8_t q0,
- const uint8x8_t ref) {
- uint8x8_t q2 = vabd_u8(q0, ref);
- *vec_src = vpadal_u8(*vec_src, q2);
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
+ res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
+ res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
+ res[3] = horizontal_long_add_u16x8(sum_lo[3], sum_hi[3]);
}
-static void sad_row8_neon(uint16x4_t *vec_src, const uint8x8_t *q0,
- const uint8_t *ref_ptr) {
- uint8x8_t q1 = vld1_u8(ref_ptr);
- uint8x8_t q2 = vabd_u8(*q0, q1);
- *vec_src = vpadal_u8(*vec_src, q2);
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src + i * src_stride);
+ sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+ res[3] = horizontal_add_u16x8(sum[3]);
}
-static void sad_row16_neon(uint16x8_t *vec_src, const uint8x16_t *q0,
- const uint8_t *ref_ptr) {
- uint8x16_t q1 = vld1q_u8(ref_ptr);
- uint8x16_t q2 = vabdq_u8(*q0, q1);
- *vec_src = vpadalq_u8(*vec_src, q2);
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x8_t abs_diff = vabd_u8(src, ref);
+ *sad_sum = vaddw_u8(*sad_sum, abs_diff);
}
-void aom_sadMxNx4d_neon(int width, int height, const uint8_t *src,
- int src_stride, const uint8_t *const ref[4],
- int ref_stride, uint32_t res[4]) {
- const uint8_t *ref0, *ref1, *ref2, *ref3;
-
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- res[0] = 0;
- res[1] = 0;
- res[2] = 0;
- res[3] = 0;
-
- switch (width) {
- case 4: {
- uint32_t src4, ref40, ref41, ref42, ref43;
- uint32x2_t q8 = vdup_n_u32(0);
- uint32x2_t q4 = vdup_n_u32(0);
- uint32x2_t q5 = vdup_n_u32(0);
- uint32x2_t q6 = vdup_n_u32(0);
- uint32x2_t q7 = vdup_n_u32(0);
-
- for (int i = 0; i < height / 2; i++) {
- uint16x4_t q0 = vdup_n_u16(0);
- uint16x4_t q1 = vdup_n_u16(0);
- uint16x4_t q2 = vdup_n_u16(0);
- uint16x4_t q3 = vdup_n_u16(0);
-
- memcpy(&src4, src, 4);
- memcpy(&ref40, ref0, 4);
- memcpy(&ref41, ref1, 4);
- memcpy(&ref42, ref2, 4);
- memcpy(&ref43, ref3, 4);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
-
- q8 = vset_lane_u32(src4, q8, 0);
- q4 = vset_lane_u32(ref40, q4, 0);
- q5 = vset_lane_u32(ref41, q5, 0);
- q6 = vset_lane_u32(ref42, q6, 0);
- q7 = vset_lane_u32(ref43, q7, 0);
-
- memcpy(&src4, src, 4);
- memcpy(&ref40, ref0, 4);
- memcpy(&ref41, ref1, 4);
- memcpy(&ref42, ref2, 4);
- memcpy(&ref43, ref3, 4);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
-
- q8 = vset_lane_u32(src4, q8, 1);
- q4 = vset_lane_u32(ref40, q4, 1);
- q5 = vset_lane_u32(ref41, q5, 1);
- q6 = vset_lane_u32(ref42, q6, 1);
- q7 = vset_lane_u32(ref43, q7, 1);
-
- sad_row4_neon(&q0, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q4));
- sad_row4_neon(&q1, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q5));
- sad_row4_neon(&q2, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q6));
- sad_row4_neon(&q3, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q7));
-
- res[0] += horizontal_add_u16x4(q0);
- res[1] += horizontal_add_u16x4(q1);
- res[2] += horizontal_add_u16x4(q2);
- res[3] += horizontal_add_u16x4(q3);
- }
- break;
- }
- case 8: {
- for (int i = 0; i < height; i++) {
- uint16x4_t q0 = vdup_n_u16(0);
- uint16x4_t q1 = vdup_n_u16(0);
- uint16x4_t q2 = vdup_n_u16(0);
- uint16x4_t q3 = vdup_n_u16(0);
-
- uint8x8_t q5 = vld1_u8(src);
-
- sad_row8_neon(&q0, &q5, ref0);
- sad_row8_neon(&q1, &q5, ref1);
- sad_row8_neon(&q2, &q5, ref2);
- sad_row8_neon(&q3, &q5, ref3);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
-
- res[0] += horizontal_add_u16x4(q0);
- res[1] += horizontal_add_u16x4(q1);
- res[2] += horizontal_add_u16x4(q2);
- res[3] += horizontal_add_u16x4(q3);
- }
- break;
- }
- case 16: {
- for (int i = 0; i < height; i++) {
- uint16x8_t q0 = vdupq_n_u16(0);
- uint16x8_t q1 = vdupq_n_u16(0);
- uint16x8_t q2 = vdupq_n_u16(0);
- uint16x8_t q3 = vdupq_n_u16(0);
-
- uint8x16_t q4 = vld1q_u8(src);
-
- sad_row16_neon(&q0, &q4, ref0);
- sad_row16_neon(&q1, &q4, ref1);
- sad_row16_neon(&q2, &q4, ref2);
- sad_row16_neon(&q3, &q4, ref3);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
-
- res[0] += horizontal_add_u16x8(q0);
- res[1] += horizontal_add_u16x8(q1);
- res[2] += horizontal_add_u16x8(q2);
- res[3] += horizontal_add_u16x8(q3);
- }
- break;
- }
- case 32: {
- for (int i = 0; i < height; i++) {
- uint16x8_t q0 = vdupq_n_u16(0);
- uint16x8_t q1 = vdupq_n_u16(0);
- uint16x8_t q2 = vdupq_n_u16(0);
- uint16x8_t q3 = vdupq_n_u16(0);
-
- uint8x16_t q4 = vld1q_u8(src);
-
- sad_row16_neon(&q0, &q4, ref0);
- sad_row16_neon(&q1, &q4, ref1);
- sad_row16_neon(&q2, &q4, ref2);
- sad_row16_neon(&q3, &q4, ref3);
-
- q4 = vld1q_u8(src + 16);
-
- sad_row16_neon(&q0, &q4, ref0 + 16);
- sad_row16_neon(&q1, &q4, ref1 + 16);
- sad_row16_neon(&q2, &q4, ref2 + 16);
- sad_row16_neon(&q3, &q4, ref3 + 16);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
-
- res[0] += horizontal_add_u16x8(q0);
- res[1] += horizontal_add_u16x8(q1);
- res[2] += horizontal_add_u16x8(q2);
- res[3] += horizontal_add_u16x8(q3);
- }
- break;
- }
- case 64: {
- for (int i = 0; i < height; i++) {
- uint16x8_t q0 = vdupq_n_u16(0);
- uint16x8_t q1 = vdupq_n_u16(0);
- uint16x8_t q2 = vdupq_n_u16(0);
- uint16x8_t q3 = vdupq_n_u16(0);
-
- uint8x16_t q4 = vld1q_u8(src);
-
- sad_row16_neon(&q0, &q4, ref0);
- sad_row16_neon(&q1, &q4, ref1);
- sad_row16_neon(&q2, &q4, ref2);
- sad_row16_neon(&q3, &q4, ref3);
-
- q4 = vld1q_u8(src + 16);
-
- sad_row16_neon(&q0, &q4, ref0 + 16);
- sad_row16_neon(&q1, &q4, ref1 + 16);
- sad_row16_neon(&q2, &q4, ref2 + 16);
- sad_row16_neon(&q3, &q4, ref3 + 16);
-
- q4 = vld1q_u8(src + 32);
-
- sad_row16_neon(&q0, &q4, ref0 + 32);
- sad_row16_neon(&q1, &q4, ref1 + 32);
- sad_row16_neon(&q2, &q4, ref2 + 32);
- sad_row16_neon(&q3, &q4, ref3 + 32);
-
- q4 = vld1q_u8(src + 48);
-
- sad_row16_neon(&q0, &q4, ref0 + 48);
- sad_row16_neon(&q1, &q4, ref1 + 48);
- sad_row16_neon(&q2, &q4, ref2 + 48);
- sad_row16_neon(&q3, &q4, ref3 + 48);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
-
- res[0] += horizontal_add_u16x8(q0);
- res[1] += horizontal_add_u16x8(q1);
- res[2] += horizontal_add_u16x8(q2);
- res[3] += horizontal_add_u16x8(q3);
- }
- break;
- }
- case 128: {
- for (int i = 0; i < height; i++) {
- uint16x8_t q0 = vdupq_n_u16(0);
- uint16x8_t q1 = vdupq_n_u16(0);
- uint16x8_t q2 = vdupq_n_u16(0);
- uint16x8_t q3 = vdupq_n_u16(0);
-
- uint8x16_t q4 = vld1q_u8(src);
-
- sad_row16_neon(&q0, &q4, ref0);
- sad_row16_neon(&q1, &q4, ref1);
- sad_row16_neon(&q2, &q4, ref2);
- sad_row16_neon(&q3, &q4, ref3);
-
- q4 = vld1q_u8(src + 16);
-
- sad_row16_neon(&q0, &q4, ref0 + 16);
- sad_row16_neon(&q1, &q4, ref1 + 16);
- sad_row16_neon(&q2, &q4, ref2 + 16);
- sad_row16_neon(&q3, &q4, ref3 + 16);
-
- q4 = vld1q_u8(src + 32);
-
- sad_row16_neon(&q0, &q4, ref0 + 32);
- sad_row16_neon(&q1, &q4, ref1 + 32);
- sad_row16_neon(&q2, &q4, ref2 + 32);
- sad_row16_neon(&q3, &q4, ref3 + 32);
-
- q4 = vld1q_u8(src + 48);
-
- sad_row16_neon(&q0, &q4, ref0 + 48);
- sad_row16_neon(&q1, &q4, ref1 + 48);
- sad_row16_neon(&q2, &q4, ref2 + 48);
- sad_row16_neon(&q3, &q4, ref3 + 48);
-
- q4 = vld1q_u8(src + 64);
-
- sad_row16_neon(&q0, &q4, ref0 + 64);
- sad_row16_neon(&q1, &q4, ref1 + 64);
- sad_row16_neon(&q2, &q4, ref2 + 64);
- sad_row16_neon(&q3, &q4, ref3 + 64);
-
- q4 = vld1q_u8(src + 80);
-
- sad_row16_neon(&q0, &q4, ref0 + 80);
- sad_row16_neon(&q1, &q4, ref1 + 80);
- sad_row16_neon(&q2, &q4, ref2 + 80);
- sad_row16_neon(&q3, &q4, ref3 + 80);
-
- q4 = vld1q_u8(src + 96);
-
- sad_row16_neon(&q0, &q4, ref0 + 96);
- sad_row16_neon(&q1, &q4, ref1 + 96);
- sad_row16_neon(&q2, &q4, ref2 + 96);
- sad_row16_neon(&q3, &q4, ref3 + 96);
-
- q4 = vld1q_u8(src + 112);
-
- sad_row16_neon(&q0, &q4, ref0 + 112);
- sad_row16_neon(&q1, &q4, ref1 + 112);
- sad_row16_neon(&q2, &q4, ref2 + 112);
- sad_row16_neon(&q3, &q4, ref3 + 112);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
-
- res[0] += horizontal_add_u16x8(q0);
- res[1] += horizontal_add_u16x8(q1);
- res[2] += horizontal_add_u16x8(q2);
- res[3] += horizontal_add_u16x8(q3);
- }
- }
- }
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ const uint8x8_t s = vld1_u8(src + i * src_stride);
+ sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+ res[3] = horizontal_add_u16x8(sum[3]);
}
-#define SAD_SKIP_MXN_NEON(m, n) \
- void aom_sad_skip_##m##x##n##x4d_neon(const uint8_t *src, int src_stride, \
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint32x2_t s, r0, r1, r2, r3;
+ uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi;
+
+ memcpy(&s_lo, src + i * src_stride, 4);
+ memcpy(&r0_lo, ref[0] + i * ref_stride, 4);
+ memcpy(&r1_lo, ref[1] + i * ref_stride, 4);
+ memcpy(&r2_lo, ref[2] + i * ref_stride, 4);
+ memcpy(&r3_lo, ref[3] + i * ref_stride, 4);
+ s = vdup_n_u32(s_lo);
+ r0 = vdup_n_u32(r0_lo);
+ r1 = vdup_n_u32(r1_lo);
+ r2 = vdup_n_u32(r2_lo);
+ r3 = vdup_n_u32(r3_lo);
+
+ memcpy(&s_hi, src + (i + 1) * src_stride, 4);
+ memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4);
+ memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4);
+ memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4);
+ memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4);
+ s = vset_lane_u32(s_hi, s, 1);
+ r0 = vset_lane_u32(r0_hi, r0, 1);
+ r1 = vset_lane_u32(r1_hi, r1, 1);
+ r2 = vset_lane_u32(r2_hi, r2, 1);
+ r3 = vset_lane_u32(r3_hi, r3, 1);
+
+ sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]);
+ sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]);
+ sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]);
+ sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]);
+
+ i += 2;
+ } while (i < h);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+ res[3] = horizontal_add_u16x8(sum[3]);
+}
+
+#define SAD_WXH_4D_NEON(w, h) \
+ void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
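// For reference, SAD_WXH_4D_NEON(16, 16) expands to (illustration only):
//
//   void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
//                             const uint8_t *const ref[4], int ref_stride,
//                             uint32_t res[4]) {
//     sad16xhx4d_neon(src, src_stride, ref, ref_stride, res, (16));
//   }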
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+SAD_WXH_4D_NEON(4, 16)
+SAD_WXH_4D_NEON(4, 32)
+
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+SAD_WXH_4D_NEON(8, 32)
+
+SAD_WXH_4D_NEON(16, 4)
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+SAD_WXH_4D_NEON(16, 64)
+
+SAD_WXH_4D_NEON(32, 8)
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 16)
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+SAD_WXH_4D_NEON(64, 128)
+
+SAD_WXH_4D_NEON(128, 64)
+SAD_WXH_4D_NEON(128, 128)
+
+#undef SAD_WXH_4D_NEON
+
+#define SAD_SKIP_WXH_4D_NEON(w, h) \
+ void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
const uint8_t *const ref[4], \
int ref_stride, uint32_t res[4]) { \
- aom_sadMxNx4d_neon(m, ((n) >> 1), src, 2 * src_stride, ref, \
- 2 * ref_stride, res); \
+ sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \
+ ((h) >> 1)); \
res[0] <<= 1; \
res[1] <<= 1; \
res[2] <<= 1; \
res[3] <<= 1; \
}
-SAD_SKIP_MXN_NEON(4, 8)
-SAD_SKIP_MXN_NEON(4, 16)
-SAD_SKIP_MXN_NEON(4, 32)
+SAD_SKIP_WXH_4D_NEON(4, 8)
+SAD_SKIP_WXH_4D_NEON(4, 16)
+SAD_SKIP_WXH_4D_NEON(4, 32)
-SAD_SKIP_MXN_NEON(8, 8)
-SAD_SKIP_MXN_NEON(8, 16)
-SAD_SKIP_MXN_NEON(8, 32)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
+SAD_SKIP_WXH_4D_NEON(8, 32)
-SAD_SKIP_MXN_NEON(16, 8)
-SAD_SKIP_MXN_NEON(16, 16)
-SAD_SKIP_MXN_NEON(16, 32)
-SAD_SKIP_MXN_NEON(16, 64)
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
+SAD_SKIP_WXH_4D_NEON(16, 64)
-SAD_SKIP_MXN_NEON(32, 8)
-SAD_SKIP_MXN_NEON(32, 16)
-SAD_SKIP_MXN_NEON(32, 32)
-SAD_SKIP_MXN_NEON(32, 64)
+SAD_SKIP_WXH_4D_NEON(32, 8)
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
-SAD_SKIP_MXN_NEON(64, 16)
-SAD_SKIP_MXN_NEON(64, 32)
-SAD_SKIP_MXN_NEON(64, 64)
-SAD_SKIP_MXN_NEON(64, 128)
+SAD_SKIP_WXH_4D_NEON(64, 16)
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
+SAD_SKIP_WXH_4D_NEON(64, 128)
-SAD_SKIP_MXN_NEON(128, 64)
-SAD_SKIP_MXN_NEON(128, 128)
+SAD_SKIP_WXH_4D_NEON(128, 64)
+SAD_SKIP_WXH_4D_NEON(128, 128)
-#undef SAD_SKIP_MXN_NEON
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index acd2c5489..5ba7f1098 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -15,531 +15,785 @@
#include "aom/aom_integer.h"
#include "aom_dsp/arm/sum_neon.h"
-unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 15; i++) {
- d0 = vld1_u8(src_ptr);
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ } while (j < w);
+
src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
+ } while (--i != 0);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}
-unsigned int aom_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x2_t d1;
- uint64x1_t d3;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 3; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
- d1 = vpaddl_u16(vget_low_u16(q12));
- d3 = vpaddl_u32(d1);
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
- return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
}
-unsigned int aom_sad16x8_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 7; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
- return vget_lane_u32(d5, 0);
-}
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
- for (i = 0; i < 64; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
- const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
- const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
- vget_low_u8(vec_ref_32));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
- vget_high_u8(vec_ref_32));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
- vget_low_u8(vec_ref_48));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
- vget_high_u8(vec_ref_48));
- }
- return horizontal_long_add_u16x8(vec_accum_lo, vec_accum_hi);
-}
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
-unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- uint16x8_t vec_accum_lo, vec_accum_hi;
- uint32x4_t vec_accum_32lo = vdupq_n_u32(0);
- uint32x4_t vec_accum_32hi = vdupq_n_u32(0);
- uint16x8_t tmp;
- for (int i = 0; i < 128; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
- const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
- const uint8x16_t vec_src_64 = vld1q_u8(src + 64);
- const uint8x16_t vec_src_80 = vld1q_u8(src + 80);
- const uint8x16_t vec_src_96 = vld1q_u8(src + 96);
- const uint8x16_t vec_src_112 = vld1q_u8(src + 112);
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
- const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
- const uint8x16_t vec_ref_64 = vld1q_u8(ref + 64);
- const uint8x16_t vec_ref_80 = vld1q_u8(ref + 80);
- const uint8x16_t vec_ref_96 = vld1q_u8(ref + 96);
- const uint8x16_t vec_ref_112 = vld1q_u8(ref + 112);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo = vdupq_n_u16(0);
- vec_accum_hi = vdupq_n_u16(0);
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
- vget_low_u8(vec_ref_32));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
- vget_high_u8(vec_ref_32));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
- vget_low_u8(vec_ref_48));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
- vget_high_u8(vec_ref_48));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_64),
- vget_low_u8(vec_ref_64));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_64),
- vget_high_u8(vec_ref_64));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_80),
- vget_low_u8(vec_ref_80));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_80),
- vget_high_u8(vec_ref_80));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_96),
- vget_low_u8(vec_ref_96));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_96),
- vget_high_u8(vec_ref_96));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_112),
- vget_low_u8(vec_ref_112));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_112),
- vget_high_u8(vec_ref_112));
-
- tmp = vaddq_u16(vec_accum_lo, vec_accum_hi);
- vec_accum_32lo = vaddw_u16(vec_accum_32lo, vget_low_u16(tmp));
- vec_accum_32hi = vaddw_u16(vec_accum_32hi, vget_high_u16(tmp));
- }
- const uint32x4_t a = vaddq_u32(vec_accum_32lo, vec_accum_32hi);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
- for (i = 0; i < 32; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- }
- return horizontal_add_u16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
-unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
- for (i = 0; i < 16; ++i) {
- const uint8x16_t vec_src = vld1q_u8(src);
- const uint8x16_t vec_ref = vld1q_u8(ref);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo =
- vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
- vec_accum_hi =
- vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
- }
- return horizontal_add_u16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}
-unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum = vdupq_n_u16(0);
-
- for (i = 0; i < 8; ++i) {
- const uint8x8_t vec_src = vld1_u8(src);
- const uint8x8_t vec_ref = vld1_u8(ref);
- src += src_stride;
- ref += ref_stride;
- vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
- }
- return horizontal_add_u16x8(vec_accum);
-}
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- int sum = 0;
- for (int i = 0; i < h; i++) {
- uint16x8_t q3 = vdupq_n_u16(0);
-
- uint8x16_t q0 = vld1q_u8(src_ptr);
- uint8x16_t q1 = vld1q_u8(ref_ptr);
- uint8x16_t q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 16);
- q1 = vld1q_u8(ref_ptr + 16);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 32);
- q1 = vld1q_u8(ref_ptr + 32);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 48);
- q1 = vld1q_u8(ref_ptr + 48);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 64);
- q1 = vld1q_u8(ref_ptr + 64);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 80);
- q1 = vld1q_u8(ref_ptr + 80);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 96);
- q1 = vld1q_u8(ref_ptr + 96);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 112);
- q1 = vld1q_u8(ref_ptr + 112);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
+  // We use 8 accumulators to prevent overflow for large values of 'h', as
+  // well as to enable optimal UADALP instruction throughput on CPUs that have
+  // either 2 or 4 Neon pipes.
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
+ uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ s4 = vld1q_u8(src_ptr + 64);
+ r4 = vld1q_u8(ref_ptr + 64);
+ diff4 = vabdq_u8(s4, r4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ s5 = vld1q_u8(src_ptr + 80);
+ r5 = vld1q_u8(ref_ptr + 80);
+ diff5 = vabdq_u8(s5, r5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ s6 = vld1q_u8(src_ptr + 96);
+ r6 = vld1q_u8(ref_ptr + 96);
+ diff6 = vabdq_u8(s6, r6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ s7 = vld1q_u8(src_ptr + 112);
+ r7 = vld1q_u8(ref_ptr + 112);
+ diff7 = vabdq_u8(s7, r7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
src_ptr += src_stride;
ref_ptr += ref_stride;
-
- sum += horizontal_add_u16x8(q3);
- }
-
- return sum;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
}
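// Worked bound for the eight accumulators above (illustrative arithmetic):
// each accumulator sees one 16-byte chunk per row, so a 16-bit lane grows by
// at most 2 * 255 = 510 per row; even at h = 128 that is 65280, which still
// fits in a uint16_t before the final widening adds into sum_u32.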
static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- int sum = 0;
- for (int i = 0; i < h; i++) {
- uint16x8_t q3 = vdupq_n_u16(0);
-
- uint8x16_t q0 = vld1q_u8(src_ptr);
- uint8x16_t q1 = vld1q_u8(ref_ptr);
- uint8x16_t q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 16);
- q1 = vld1q_u8(ref_ptr + 16);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 32);
- q1 = vld1q_u8(ref_ptr + 32);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
-
- q0 = vld1q_u8(src_ptr + 48);
- q1 = vld1q_u8(ref_ptr + 48);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
+ uint8x16_t diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
src_ptr += src_stride;
ref_ptr += ref_stride;
+ } while (--i != 0);
- sum += horizontal_add_u16x8(q3);
- }
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
- return sum;
+ return horizontal_add_u32x4(sum_u32);
}
static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- int sum = 0;
- for (int i = 0; i < h; i++) {
- uint16x8_t q3 = vdupq_n_u16(0);
+ uint32x4_t sum = vdupq_n_u32(0);
- uint8x16_t q0 = vld1q_u8(src_ptr);
- uint8x16_t q1 = vld1q_u8(ref_ptr);
- uint8x16_t q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t diff0 = vabdq_u8(s0, r0);
+ uint16x8_t sum0 = vpaddlq_u8(diff0);
- q0 = vld1q_u8(src_ptr + 16);
- q1 = vld1q_u8(ref_ptr + 16);
- q2 = vabdq_u8(q0, q1);
- q3 = vpadalq_u8(q3, q2);
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t diff1 = vabdq_u8(s1, r1);
+ uint16x8_t sum1 = vpaddlq_u8(diff1);
- sum += horizontal_add_u16x8(q3);
+ sum = vpadalq_u16(sum, sum0);
+ sum = vpadalq_u16(sum, sum1);
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
- return sum;
+ return horizontal_add_u32x4(sum);
}
static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- int sum = 0;
- for (int i = 0; i < h; i++) {
- uint8x8_t q0 = vld1_u8(src_ptr);
- uint8x8_t q1 = vld1_u8(ref_ptr);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
- q0 = vld1_u8(src_ptr + 8);
- q1 = vld1_u8(ref_ptr + 8);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
- sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sum = vpadalq_u8(sum, diff);
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
- return sum;
+ return horizontal_add_u16x8(sum);
}
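For reference, a short sketch of what the vpadalq_u8() accumulation used by the wider kernels above computes per row (standard Neon UADALP semantics, not text from the change itself):

/* For each of the eight 16-bit lanes k of acc:
 *   acc[k] += diff[2 * k] + diff[2 * k + 1]
 * so one UADALP folds all 16 absolute differences of a 16-byte row chunk into
 * acc, and the 16-bit lanes only need widening to 32 bits once, after the
 * row loop. */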
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- uint16x8_t q3 = vdupq_n_u16(0);
- for (int y = 0; y < h; y++) {
- uint8x8_t q0 = vld1_u8(src_ptr);
- uint8x8_t q1 = vld1_u8(ref_ptr);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+
+ sum = vabal_u8(sum, s, r);
+
src_ptr += src_stride;
ref_ptr += ref_stride;
- q3 = vabal_u8(q3, q0, q1);
- }
- return horizontal_add_u16x8(q3);
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
}
static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- uint16x8_t q3 = vdupq_n_u16(0);
- uint32x2_t q0 = vdup_n_u32(0);
- uint32x2_t q1 = vdup_n_u32(0);
- uint32_t src4, ref4;
- for (int y = 0; y < h / 2; y++) {
- memcpy(&src4, src_ptr, 4);
- memcpy(&ref4, ref_ptr, 4);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint32x2_t s, r;
+ uint32_t s0, s1, r0, r1;
+
+ memcpy(&s0, src_ptr, 4);
+ memcpy(&r0, ref_ptr, 4);
+ s = vdup_n_u32(s0);
+ r = vdup_n_u32(r0);
src_ptr += src_stride;
ref_ptr += ref_stride;
- q0 = vset_lane_u32(src4, q0, 0);
- q1 = vset_lane_u32(ref4, q1, 0);
- memcpy(&src4, src_ptr, 4);
- memcpy(&ref4, ref_ptr, 4);
+ memcpy(&s1, src_ptr, 4);
+ memcpy(&r1, ref_ptr, 4);
+ s = vset_lane_u32(s1, s, 1);
+ r = vset_lane_u32(r1, r, 1);
src_ptr += src_stride;
ref_ptr += ref_stride;
- q0 = vset_lane_u32(src4, q0, 1);
- q1 = vset_lane_u32(ref4, q1, 1);
- q3 = vabal_u8(q3, vreinterpret_u8_u32(q0), vreinterpret_u8_u32(q1));
- }
- return horizontal_add_u16x8(q3);
+ sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r));
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
}
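The memcpy-based loads above are the usual alignment- and aliasing-safe way to pull four bytes into a scalar before moving them into a vector lane; two rows are packed into one 64-bit vector so a single vabal_u8 handles both (layout sketch, for illustration only):

/* After the vdup_n_u32 / vset_lane_u32 steps:
 *   s = { row0[0..3], row1[0..3] }   r = { ref_row0[0..3], ref_row1[0..3] }
 * so vabal_u8(sum, s, r) accumulates eight absolute differences, i.e. two
 * 4-wide rows, per loop iteration. */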
-#define FSADS128_H(h) \
- unsigned int aom_sad_skip_128x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- const uint32_t sum = sad128xh_neon(src_ptr, 2 * src_stride, ref_ptr, \
- 2 * ref_stride, h / 2); \
- return 2 * sum; \
+#define SAD_WXH_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
}
-FSADS128_H(128)
-FSADS128_H(64)
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+SAD_WXH_NEON(4, 16)
+
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+SAD_WXH_NEON(8, 32)
+
+SAD_WXH_NEON(16, 4)
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+SAD_WXH_NEON(16, 64)
+
+SAD_WXH_NEON(32, 8)
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 16)
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+SAD_WXH_NEON(64, 128)
+
+SAD_WXH_NEON(128, 64)
+SAD_WXH_NEON(128, 128)
+
+#undef SAD_WXH_NEON
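The generated wrappers are trivial forwarding functions; for example, the 16x16 instantiation of the macro above expands to the equivalent of (shown for illustration only):

unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  return sad16xh_neon(src, src_stride, ref, ref_stride, 16);
}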
+
+#define SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
+ }
-#undef FSADS128_H
+SAD_SKIP_WXH_NEON(4, 8)
+SAD_SKIP_WXH_NEON(4, 16)
+
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+SAD_SKIP_WXH_NEON(8, 32)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+SAD_SKIP_WXH_NEON(16, 64)
+
+SAD_SKIP_WXH_NEON(32, 8)
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 16)
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+SAD_SKIP_WXH_NEON(64, 128)
+
+SAD_SKIP_WXH_NEON(128, 64)
+SAD_SKIP_WXH_NEON(128, 128)
+
+#undef SAD_SKIP_WXH_NEON
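A scalar sketch of what the skip wrappers compute may help: they visit only the even rows (by doubling the stride and halving the height) and scale the result by two to approximate the full-block SAD. Illustrative C only, assuming <stdlib.h> for abs():

static unsigned int sad_skip_wxh_sketch(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        int w, int h) {
  unsigned int sad = 0;
  for (int y = 0; y < h; y += 2) {  // even rows only
    for (int x = 0; x < w; ++x) sad += abs(src[x] - ref[x]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad;  // compensate for the skipped odd rows
}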
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
-#define FSADS64_H(h) \
- unsigned int aom_sad_skip_64x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
-FSADS64_H(128)
-FSADS64_H(64)
-FSADS64_H(32)
-FSADS64_H(16)
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
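The vdotq_u32() calls above use an all-ones vector as the second operand; per the Arm dot-product semantics this reduces each group of four absolute differences straight into a 32-bit lane, roughly:

/* Per 32-bit lane i of sum:
 *   sum[i] += diff[4*i + 0] + diff[4*i + 1] + diff[4*i + 2] + diff[4*i + 3]
 * so no 16-bit intermediate accumulators (and no overflow bookkeeping) are
 * needed, regardless of block height. */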
-#undef FSADS64_H
+static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
+ second_pred);
+}
-#define FSADS32_H(h) \
- unsigned int aom_sad_skip_32x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad32xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
-FSADS32_H(64)
-FSADS32_H(32)
-FSADS32_H(16)
-FSADS32_H(8)
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
-#undef FSADS32_H
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-#define FSADS16_H(h) \
- unsigned int aom_sad_skip_16x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad16xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-FSADS16_H(64)
-FSADS16_H(32)
-FSADS16_H(16)
-FSADS16_H(8)
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-#undef FSADS16_H
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
-#define FSADS8_H(h) \
- unsigned int aom_sad_skip_8x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad8xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ // We use 8 accumulators to prevent overflow for large values of 'h', as well
+ // as enabling optimal UADALP instruction throughput on CPUs that have either
+ // 2 or 4 Neon pipes.
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
+ uint8x16_t p0, p1, p2, p3, p4, p5, p6, p7;
+ uint8x16_t avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7;
+ uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ s4 = vld1q_u8(src_ptr + 64);
+ r4 = vld1q_u8(ref_ptr + 64);
+ p4 = vld1q_u8(second_pred + 64);
+ avg4 = vrhaddq_u8(r4, p4);
+ diff4 = vabdq_u8(s4, avg4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ s5 = vld1q_u8(src_ptr + 80);
+ r5 = vld1q_u8(ref_ptr + 80);
+ p5 = vld1q_u8(second_pred + 80);
+ avg5 = vrhaddq_u8(r5, p5);
+ diff5 = vabdq_u8(s5, avg5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ s6 = vld1q_u8(src_ptr + 96);
+ r6 = vld1q_u8(ref_ptr + 96);
+ p6 = vld1q_u8(second_pred + 96);
+ avg6 = vrhaddq_u8(r6, p6);
+ diff6 = vabdq_u8(s6, avg6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ s7 = vld1q_u8(src_ptr + 112);
+ r7 = vld1q_u8(ref_ptr + 112);
+ p7 = vld1q_u8(second_pred + 112);
+ avg7 = vrhaddq_u8(r7, p7);
+ diff7 = vabdq_u8(s7, avg7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
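The eight 16-bit accumulators used above are enough to avoid overflow; a worked bound (an editorial calculation, not text from the change):

/* Each uint16x8_t lane absorbs at most 2 * 255 = 510 per row via UADALP.
 * After the largest height handled here (h = 128) a lane holds at most
 * 128 * 510 = 65280 < 65535, so widening to 32 bits can safely be deferred
 * until after the row loop. */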
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+ uint8x16_t diff0 = vabdq_u8(s0, avg0);
+ uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+ uint8x16_t diff1 = vabdq_u8(s1, avg1);
+ uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+ sum = vpadalq_u16(sum, sum0);
+ sum = vpadalq_u16(sum, sum1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
-FSADS8_H(32)
-FSADS8_H(16)
-FSADS8_H(8)
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
-#undef FSADS8_H
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
-#define FSADS4_H(h) \
- unsigned int aom_sad_skip_4x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad4xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
+ uint8x16_t avg = vrhaddq_u8(r, p);
+ uint8x16_t diff = vabdq_u8(s, avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint32x2_t s, r;
+ uint32_t s0, s1, r0, r1;
+ uint8x8_t p, avg;
+
+ memcpy(&s0, src_ptr, 4);
+ memcpy(&r0, ref_ptr, 4);
+ s = vdup_n_u32(s0);
+ r = vdup_n_u32(r0);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ memcpy(&s1, src_ptr, 4);
+ memcpy(&r1, ref_ptr, 4);
+ s = vset_lane_u32(s1, s, 1);
+ r = vset_lane_u32(r1, r, 1);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ p = vld1_u8(second_pred);
+ avg = vrhadd_u8(vreinterpret_u8_u32(r), p);
+
+ sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg);
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define SAD_WXH_AVG_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
}
-FSADS4_H(16)
-FSADS4_H(8)
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
+SAD_WXH_AVG_NEON(4, 16)
+
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
+SAD_WXH_AVG_NEON(8, 32)
+
+SAD_WXH_AVG_NEON(16, 4)
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
+SAD_WXH_AVG_NEON(16, 64)
+
+SAD_WXH_AVG_NEON(32, 8)
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+SAD_WXH_AVG_NEON(64, 16)
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
+SAD_WXH_AVG_NEON(64, 128)
+
+SAD_WXH_AVG_NEON(128, 64)
+SAD_WXH_AVG_NEON(128, 128)
-#undef FSADS4_H
+#undef SAD_WXH_AVG_NEON
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index a69dfb53a..2c988dc4c 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -16,141 +16,310 @@
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
-static INLINE void sse_w16_neon(uint32x4_t *sum, const uint8_t *a,
- const uint8_t *b) {
- const uint8x16_t v_a0 = vld1q_u8(a);
- const uint8x16_t v_b0 = vld1q_u8(b);
- const uint8x16_t diff = vabdq_u8(v_a0, v_b0);
- const uint8x8_t diff_lo = vget_low_u8(diff);
- const uint8x8_t diff_hi = vget_high_u8(diff);
- *sum = vpadalq_u16(*sum, vmull_u8(diff_lo, diff_lo));
- *sum = vpadalq_u16(*sum, vmull_u8(diff_hi, diff_hi));
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ *sse = vdotq_u32(*sse, abs_diff, abs_diff);
}
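Dotting the absolute-difference vector with itself is what turns UDOT into an SSE kernel; a scalar sketch of one 32-bit lane:

/* Per 32-bit lane i of *sse, with d = |src - ref|:
 *   sse[i] += d[4*i]*d[4*i] + d[4*i+1]*d[4*i+1]
 *           + d[4*i+2]*d[4*i+2] + d[4*i+3]*d[4*i+3]
 * i.e. the sum of squared differences for four pixels in one instruction. */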
-static INLINE void aom_sse4x2_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32x4_t *sum) {
- uint8x8_t v_a0, v_b0;
- v_a0 = v_b0 = vcreate_u8(0);
- // above line is only to shadow [-Werror=uninitialized]
- v_a0 = vreinterpret_u8_u32(
- vld1_lane_u32((uint32_t *)a, vreinterpret_u32_u8(v_a0), 0));
- v_a0 = vreinterpret_u8_u32(
- vld1_lane_u32((uint32_t *)(a + a_stride), vreinterpret_u32_u8(v_a0), 1));
- v_b0 = vreinterpret_u8_u32(
- vld1_lane_u32((uint32_t *)b, vreinterpret_u32_u8(v_b0), 0));
- v_b0 = vreinterpret_u8_u32(
- vld1_lane_u32((uint32_t *)(b + b_stride), vreinterpret_u32_u8(v_b0), 1));
- const uint8x8_t v_a_w = vabd_u8(v_a0, v_b0);
- *sum = vpadalq_u16(*sum, vmull_u8(v_a_w, v_a_w));
+
+static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x2_t *sse) {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
}
-static INLINE void aom_sse8_neon(const uint8_t *a, const uint8_t *b,
- uint32x4_t *sum) {
- const uint8x8_t v_a_w = vld1_u8(a);
- const uint8x8_t v_b_w = vld1_u8(b);
- const uint8x8_t v_d_w = vabd_u8(v_a_w, v_b_w);
- *sum = vpadalq_u16(*sum, vmull_u8(v_d_w, v_d_w));
+
+static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32x2_t *sse) {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
}
-int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int y = 0;
- int64_t sse = 0;
- uint32x4_t sum = vdupq_n_u32(0);
- switch (width) {
- case 4:
- do {
- aom_sse4x2_neon(a, a_stride, b, b_stride, &sum);
- a += a_stride << 1;
- b += b_stride << 1;
- y += 2;
- } while (y < height);
- sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
- break;
- case 8:
- do {
- aom_sse8_neon(a, b, &sum);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
- break;
- case 16:
+
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_8x1_neon(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_8x1_neon(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse = vdup_n_u32(0);
+
+ int i = height;
+ do {
+ sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x2(sse);
+}
+
+static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int width, int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ if ((width & 0x07) && ((width & 0x07) < 5)) {
+ int i = height;
+ do {
+ int j = 0;
do {
- sse_w16_neon(&sum, a, b);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
- break;
- case 32:
+ sse_8x1_neon(src + j, ref + j, &sse[0]);
+ sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse[1]);
+ j += 8;
+ } while (j + 4 < width);
+
+ sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse[0]);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ } else {
+ int i = height;
+ do {
+ int j = 0;
do {
- sse_w16_neon(&sum, a, b);
- sse_w16_neon(&sum, a + 16, b + 16);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
- break;
- case 64:
+ sse_8x1_neon(src + j, ref + j, &sse[0]);
+ sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse[1]);
+ j += 8;
+ } while (j < width);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ }
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ uint8x8_t abs_diff_lo = vget_low_u8(abs_diff);
+ uint8x8_t abs_diff_hi = vget_high_u8(abs_diff);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo));
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi));
+}
+
+static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32x4_t *sse) {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ int i = height;
+ do {
+ sse_8x1_neon(src, ref, &sse);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sse);
+}
+
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ int i = height;
+ do {
+ sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(sse);
+}
+
+static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int width, int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ if ((width & 0x07) && ((width & 0x07) < 5)) {
+ int i = height;
+ do {
+ int j = 0;
do {
- sse_w16_neon(&sum, a, b);
- sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
- sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
- sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
- break;
- case 128:
+ sse_8x1_neon(src + j, ref + j, &sse);
+ sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse);
+ j += 8;
+ } while (j + 4 < width);
+
+ sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ } else {
+ int i = height;
+ do {
+ int j = 0;
do {
- sse_w16_neon(&sum, a, b);
- sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
- sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
- sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
- sse_w16_neon(&sum, a + 16 * 4, b + 16 * 4);
- sse_w16_neon(&sum, a + 16 * 5, b + 16 * 5);
- sse_w16_neon(&sum, a + 16 * 6, b + 16 * 6);
- sse_w16_neon(&sum, a + 16 * 7, b + 16 * 7);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
- break;
+ sse_8x1_neon(src + j, ref + j, &sse);
+ j += 8;
+ } while (j < width);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+ return horizontal_add_u32x4(sse);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+ sse_16x1_neon(src + 64, ref + 64, &sse[0]);
+ sse_16x1_neon(src + 80, ref + 80, &sse[1]);
+ sse_16x1_neon(src + 96, ref + 96, &sse[0]);
+ sse_16x1_neon(src + 112, ref + 112, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_16x1_neon(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int width, int height) {
+ switch (width) {
+ case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
default:
- if (width & 0x07) {
- do {
- int i = 0;
- do {
- aom_sse8_neon(a + i, b + i, &sum);
- aom_sse8_neon(a + i + a_stride, b + i + b_stride, &sum);
- i += 8;
- } while (i + 4 < width);
- aom_sse4x2_neon(a + i, a_stride, b + i, b_stride, &sum);
- a += (a_stride << 1);
- b += (b_stride << 1);
- y += 2;
- } while (y < height);
- } else {
- do {
- int i = 0;
- do {
- aom_sse8_neon(a + i, b + i, &sum);
- i += 8;
- } while (i < width);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- }
- sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
- break;
+ return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
}
- return sse;
}
#if CONFIG_AV1_HIGHBITDEPTH
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 4ecf891cb..a05886066 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -17,424 +17,535 @@
#include "aom_ports/mem.h"
#include "aom/aom_integer.h"
-#include "aom_dsp/aom_filter.h"
#include "aom_dsp/variance.h"
-
-// Load 2 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
- uint32_t a;
- uint32x2_t a_u32 = vdup_n_u32(0);
- if (stride == 4) return vld1_u8(buf);
- memcpy(&a, buf, 4);
- buf += stride;
- a_u32 = vld1_lane_u32(&a, a_u32, 0);
- memcpy(&a, buf, 4);
- a_u32 = vld1_lane_u32(&a, a_u32, 1);
- return vreinterpret_u8_u32(a_u32);
+#include "aom_dsp/arm/mem_neon.h"
+
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
}
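In the rewritten helpers the two bilinear taps are (8 - filter_offset, filter_offset) and the result is rounded and shifted right by three; a scalar equivalent of one filtered pixel (illustrative only):

static uint8_t bil_filter_pixel_sketch(uint8_t s0, uint8_t s1,
                                       int filter_offset) {  // offset in [0, 8)
  return (uint8_t)(((8 - filter_offset) * s0 + filter_offset * s1 + 4) >> 3);
}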
-// Process a block exactly 4 wide and a multiple of 2 high.
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; i += 2) {
- const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
- const uint8x8_t src_1 =
- load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- src_ptr += 2 * src_pixels_per_line;
- output_ptr += 8;
- }
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ } while (--i != 0);
}
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+ vst1q_u8(dst_ptr + j, blend_u8);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
}
-// Process a block which is a multiple of 16 wide and any height.
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 16) {
- const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
- const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
- const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
- const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
- const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
- const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
- const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
- const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
- }
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
}
-unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
- int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
- bilinear_filters_2t[yoffset]);
- return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
}
-unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
- bilinear_filters_2t[yoffset]);
- return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
}
-unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
- bilinear_filters_2t[yoffset]);
- return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
+ dst_height, filter_offset);
}
-unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
- bilinear_filters_2t[yoffset]);
- return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height) {
+ // We only specialise on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
}
-unsigned int aom_sub_pixel_variance4x4_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[4 * (4 + 2)];
- uint8_t temp1[4 * 4];
-
- var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (4 + 2),
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 4,
- bilinear_filters_2t[yoffset]);
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
- return aom_variance4x4(temp1, 4, b, b_stride, sse);
-}
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+ sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
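The specialized macro exploits the fact that the subpel offsets are three-bit phases: an offset of 0 needs no filtering in that direction, and an offset of 4 is the half-pel case, which collapses to a rounded average. In outline (editorial summary of the branches above):

/* offset == 0 : no filtering needed in that direction
 * offset == 4 : half-pel; ((8 - 4)*a + 4*b + 4) >> 3 == (a + b + 1) >> 1,
 *               which is exactly var_filter_block2d_avg() / vrhaddq_u8()
 * otherwise   : generic bilinear blend, ((8 - off)*a + off*b + 4) >> 3 */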
-unsigned int aom_sub_pixel_variance4x8_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[4 * (8 + 2)];
- uint8_t temp1[4 * 8];
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
- var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (8 + 2),
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 8,
- bilinear_filters_2t[yoffset]);
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
- return aom_variance4x8(temp1, 4, b, b_stride, sse);
-}
+SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
-unsigned int aom_sub_pixel_variance8x4_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[8 * (4 + 1)];
- uint8_t temp1[8 * 4];
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
- var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (4 + 1), 8,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 4, 8,
- bilinear_filters_2t[yoffset]);
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
- return aom_variance8x4(temp1, 8, b, b_stride, sse);
-}
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
-unsigned int aom_sub_pixel_variance8x16_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[8 * (16 + 1)];
- uint8_t temp1[8 * 16];
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
- var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (16 + 1), 8,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 16, 8,
- bilinear_filters_2t[yoffset]);
+SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
- return aom_variance8x16(temp1, 8, b, b_stride, sse);
-}
+SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
-unsigned int aom_sub_pixel_variance16x8_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[16 * (8 + 1)];
- uint8_t temp1[16 * 8];
+SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 16,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 8, 16,
- bilinear_filters_2t[yoffset]);
-
- return aom_variance16x8(temp1, 16, b, b_stride, sse);
-}
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
-unsigned int aom_sub_pixel_variance16x32_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[16 * (32 + 1)];
- uint8_t temp1[16 * 32];
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 16,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 32, 16,
- bilinear_filters_2t[yoffset]);
+#endif // !CONFIG_REALTIME_ONLY
- return aom_variance16x32(temp1, 16, b, b_stride, sse);
+#undef SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
}
-unsigned int aom_sub_pixel_variance32x16_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[32 * (16 + 1)];
- uint8_t temp1[32 * 16];
-
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 32,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 16, 32,
- bilinear_filters_2t[yoffset]);
-
- return aom_variance32x16(temp1, 32, b, b_stride, sse);
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
}
-unsigned int aom_sub_pixel_variance32x64_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[32 * (64 + 1)];
- uint8_t temp1[32 * 64];
-
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 32,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 64, 32,
- bilinear_filters_2t[yoffset]);
-
- return aom_variance32x64(temp1, 32, b, b_stride, sse);
+// Combine bilinear filter with aom_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
}
-unsigned int aom_sub_pixel_variance64x32_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[64 * (32 + 1)];
- uint8_t temp1[64 * 32];
-
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 64,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 32, 64,
- bilinear_filters_2t[yoffset]);
-
- return aom_variance64x32(temp1, 64, b, b_stride, sse);
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
}
-unsigned int aom_sub_pixel_variance64x128_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[64 * (128 + 1)];
- uint8_t temp1[64 * 128];
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 64,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 128, 64,
- bilinear_filters_2t[yoffset]);
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
- return aom_variance64x128(temp1, 64, b, b_stride, sse);
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
+static void avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
}
-unsigned int aom_sub_pixel_variance128x64_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[128 * (64 + 1)];
- uint8_t temp1[128 * 64];
+// Combine averaging subpel filter with aom_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height,
+ const uint8_t *second_pred) {
+ // We only specialise on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ avg = vrhaddq_u8(avg, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 128,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 64, 128,
- bilinear_filters_2t[yoffset]);
+// Implementation of aom_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+ int dst_width, int dst_height,
+ const uint8_t *second_pred) {
+ // We only specialise on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
- return aom_variance128x64(temp1, 128, b, b_stride, sse);
-}
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
-unsigned int aom_sub_pixel_variance128x128_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[128 * (128 + 1)];
- uint8_t temp1[128 * 128];
+ uint8x16_t avg = vrhaddq_u8(s, p);
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 128,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 128, 128,
- bilinear_filters_2t[yoffset]);
+ vst1q_u8(dst_ptr + j, avg);
- return aom_variance128x128(temp1, 128, b, b_stride, sse);
-}
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
-// Realtime mode doesn't use 4x rectangular blocks.
-#if !CONFIG_REALTIME_ONLY
-unsigned int aom_sub_pixel_variance4x16_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[4 * (16 + 2)];
- uint8_t temp1[4 * 16];
-
- var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (16 + 2),
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 16,
- bilinear_filters_2t[yoffset]);
-
- return aom_variance4x16(temp1, 4, b, b_stride, sse);
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
}
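avg_pred() and the avg_pred_* helpers all finish with a rounding halving add against second_pred; per pixel this is simply (scalar sketch, for illustration):

static uint8_t comp_avg_pixel_sketch(uint8_t pred, uint8_t second_pred) {
  return (uint8_t)((pred + second_pred + 1) >> 1);  // vrhadd_u8 / vrhaddq_u8
}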
-unsigned int aom_sub_pixel_variance8x32_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[8 * (32 + 1)];
- uint8_t temp1[8 * 32];
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
- var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (32 + 1), 8,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 32, 8,
- bilinear_filters_2t[yoffset]);
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
- return aom_variance8x32(temp1, 8, b, b_stride, sse);
-}
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
-unsigned int aom_sub_pixel_variance16x4_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[16 * (4 + 1)];
- uint8_t temp1[16 * 4];
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (4 + 1), 16,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 4, 16,
- bilinear_filters_2t[yoffset]);
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
- return aom_variance16x4(temp1, 16, b, b_stride, sse);
-}
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
-unsigned int aom_sub_pixel_variance64x16_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[64 * (16 + 1)];
- uint8_t temp1[64 * 16];
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 64,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 16, 64,
- bilinear_filters_2t[yoffset]);
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
- return aom_variance64x16(temp1, 64, b, b_stride, sse);
-}
+#if !CONFIG_REALTIME_ONLY
-unsigned int aom_sub_pixel_variance16x64_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[16 * (64 + 1)];
- uint8_t temp1[16 * 64];
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 16,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 64, 16,
- bilinear_filters_2t[yoffset]);
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
- return aom_variance16x64(temp1, 16, b, b_stride, sse);
-}
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
-unsigned int aom_sub_pixel_variance32x8_neon(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- uint8_t temp0[32 * (8 + 1)];
- uint8_t temp1[32 * 8];
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
- var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 32,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 8, 32,
- bilinear_filters_2t[yoffset]);
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
- return aom_variance32x8(temp1, 32, b, b_stride, sse);
-}
#endif // !CONFIG_REALTIME_ONLY
+
+#undef SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
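The SPECIALIZED_* variant above exploits two degenerate cases of the 2-tap bilinear filter instead of always running two full filter passes: offset 0 selects taps {128, 0}, which is a plain copy, and offset 4 (half-pel) selects taps {64, 64}, which is a rounded average of adjacent pixels. A minimal scalar sketch of that observation, assuming the usual bilinear_filters_2t tap table; it is an illustration only, not code from this change:

    #include <stdint.h>

    // Hypothetical scalar model of one 2-tap bilinear sample.
    static inline uint8_t bilin_2t(uint8_t a, uint8_t b, int offset) {
      if (offset == 0) return a;                 // taps {128, 0}: copy
      if (offset == 4) return (a + b + 1) >> 1;  // taps {64, 64}: average
      return (uint8_t)((a * (128 - 16 * offset) + b * 16 * offset + 64) >> 7);
    }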
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index a118f3c15..855edf667 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -37,6 +37,34 @@ static INLINE int horizontal_add_s32x4(const int32x4_t a) {
#endif
}
+static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u64(a);
+#else
+ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddlvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
+#endif
+}
+
+static INLINE unsigned int horizontal_add_u32x4(const uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
+}
+
static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
#if defined(__aarch64__)
@@ -66,6 +94,15 @@ static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
#endif
}
+static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif
+}
+
static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
#if defined(__aarch64__)
return vaddlv_u16(a);
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index 0b7337a94..bf212a926 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -13,111 +13,83 @@
#include <assert.h>
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
#include "config/aom_dsp_rtcd.h"
-static INLINE uint32x4_t sum_squares_i16_4x4_neon(const int16_t *src,
- int stride) {
- const int16x4_t v_val_01_lo = vld1_s16(src + 0 * stride);
- const int16x4_t v_val_01_hi = vld1_s16(src + 1 * stride);
- const int16x4_t v_val_23_lo = vld1_s16(src + 2 * stride);
- const int16x4_t v_val_23_hi = vld1_s16(src + 3 * stride);
- int32x4_t v_sq_01_d = vmull_s16(v_val_01_lo, v_val_01_lo);
- v_sq_01_d = vmlal_s16(v_sq_01_d, v_val_01_hi, v_val_01_hi);
- int32x4_t v_sq_23_d = vmull_s16(v_val_23_lo, v_val_23_lo);
- v_sq_23_d = vmlal_s16(v_sq_23_d, v_val_23_hi, v_val_23_hi);
-#if defined(__aarch64__)
- return vreinterpretq_u32_s32(vpaddq_s32(v_sq_01_d, v_sq_23_d));
-#else
- return vreinterpretq_u32_s32(vcombine_s32(
- vqmovn_s64(vpaddlq_s32(v_sq_01_d)), vqmovn_s64(vpaddlq_s32(v_sq_23_d))));
-#endif
-}
+static INLINE uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
+ int stride) {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ int32x4_t sum_squares = vmull_s16(s0, s0);
+ sum_squares = vmlal_s16(sum_squares, s1, s1);
+ sum_squares = vmlal_s16(sum_squares, s2, s2);
+ sum_squares = vmlal_s16(sum_squares, s3, s3);
-uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src, int stride) {
- const uint32x4_t v_sum_0123_d = sum_squares_i16_4x4_neon(src, stride);
-#if defined(__aarch64__)
- return (uint64_t)vaddvq_u32(v_sum_0123_d);
-#else
- uint64x2_t v_sum_d = vpaddlq_u32(v_sum_0123_d);
- v_sum_d = vaddq_u64(v_sum_d, vextq_u64(v_sum_d, v_sum_d, 1));
- return vgetq_lane_u64(v_sum_d, 0);
-#endif
+ return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sum_squares));
}
-uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src, int stride,
- int height) {
- int r = 0;
- uint32x4_t v_acc_q = vdupq_n_u32(0);
+static INLINE uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
+ int stride, int height) {
+ int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int h = 0;
do {
- const uint32x4_t v_acc_d = sum_squares_i16_4x4_neon(src, stride);
- v_acc_q = vaddq_u32(v_acc_q, v_acc_d);
- src += stride << 2;
- r += 4;
- } while (r < height);
-
- uint64x2_t v_acc_64 = vpaddlq_u32(v_acc_q);
-#if defined(__aarch64__)
- return vaddvq_u64(v_acc_64);
-#else
- v_acc_64 = vaddq_u64(v_acc_64, vextq_u64(v_acc_64, v_acc_64, 1));
- return vgetq_lane_u64(v_acc_64, 0);
-#endif
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ sum_squares[0] = vmlal_s16(sum_squares[0], s0, s0);
+ sum_squares[0] = vmlal_s16(sum_squares[0], s1, s1);
+ sum_squares[1] = vmlal_s16(sum_squares[1], s2, s2);
+ sum_squares[1] = vmlal_s16(sum_squares[1], s3, s3);
+
+ src += 4 * stride;
+ h += 4;
+ } while (h < height);
+
+ return horizontal_long_add_u32x4(
+ vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1])));
}
-uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src, int stride,
- int width, int height) {
- int r = 0;
- const int32x4_t zero = vdupq_n_s32(0);
- uint64x2_t v_acc_q = vreinterpretq_u64_s32(zero);
+static INLINE uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height) {
+ uint64x2_t sum_squares = vdupq_n_u64(0);
+
+ int h = 0;
do {
- int32x4_t v_sum = zero;
- int c = 0;
+ int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
do {
- const int16_t *b = src + c;
- const int16x8_t v_val_0 = vld1q_s16(b + 0 * stride);
- const int16x8_t v_val_1 = vld1q_s16(b + 1 * stride);
- const int16x8_t v_val_2 = vld1q_s16(b + 2 * stride);
- const int16x8_t v_val_3 = vld1q_s16(b + 3 * stride);
- const int16x4_t v_val_0_lo = vget_low_s16(v_val_0);
- const int16x4_t v_val_1_lo = vget_low_s16(v_val_1);
- const int16x4_t v_val_2_lo = vget_low_s16(v_val_2);
- const int16x4_t v_val_3_lo = vget_low_s16(v_val_3);
- int32x4_t v_sum_01 = vmull_s16(v_val_0_lo, v_val_0_lo);
- v_sum_01 = vmlal_s16(v_sum_01, v_val_1_lo, v_val_1_lo);
- int32x4_t v_sum_23 = vmull_s16(v_val_2_lo, v_val_2_lo);
- v_sum_23 = vmlal_s16(v_sum_23, v_val_3_lo, v_val_3_lo);
-#if defined(__aarch64__)
- v_sum_01 = vmlal_high_s16(v_sum_01, v_val_0, v_val_0);
- v_sum_01 = vmlal_high_s16(v_sum_01, v_val_1, v_val_1);
- v_sum_23 = vmlal_high_s16(v_sum_23, v_val_2, v_val_2);
- v_sum_23 = vmlal_high_s16(v_sum_23, v_val_3, v_val_3);
- v_sum = vaddq_s32(v_sum, vpaddq_s32(v_sum_01, v_sum_23));
-#else
- const int16x4_t v_val_0_hi = vget_high_s16(v_val_0);
- const int16x4_t v_val_1_hi = vget_high_s16(v_val_1);
- const int16x4_t v_val_2_hi = vget_high_s16(v_val_2);
- const int16x4_t v_val_3_hi = vget_high_s16(v_val_3);
- v_sum_01 = vmlal_s16(v_sum_01, v_val_0_hi, v_val_0_hi);
- v_sum_01 = vmlal_s16(v_sum_01, v_val_1_hi, v_val_1_hi);
- v_sum_23 = vmlal_s16(v_sum_23, v_val_2_hi, v_val_2_hi);
- v_sum_23 = vmlal_s16(v_sum_23, v_val_3_hi, v_val_3_hi);
- v_sum = vaddq_s32(v_sum, vcombine_s32(vqmovn_s64(vpaddlq_s32(v_sum_01)),
- vqmovn_s64(vpaddlq_s32(v_sum_23))));
-#endif
- c += 8;
- } while (c < width);
-
- v_acc_q = vpadalq_u32(v_acc_q, vreinterpretq_u32_s32(v_sum));
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s3), vget_high_s16(s3));
+ w += 8;
+ } while (w < width);
+
+ sum_squares = vpadalq_u32(
+ sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1])));
src += 4 * stride;
- r += 4;
- } while (r < height);
-#if defined(__aarch64__)
- return vaddvq_u64(v_acc_q);
-#else
- v_acc_q = vaddq_u64(v_acc_q, vextq_u64(v_acc_q, v_acc_q, 1));
- return vgetq_lane_u64(v_acc_q, 0);
-#endif
+ h += 4;
+ } while (h < height);
+
+ return horizontal_add_u64x2(sum_squares);
}
uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
@@ -136,3 +108,118 @@ uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
return aom_sum_squares_2d_i16_c(src, stride, width, height);
}
}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src,
+ int stride, int *sum) {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ int32x4_t sse = vmull_s16(s0, s0);
+ sse = vmlal_s16(sse, s1, s1);
+ sse = vmlal_s16(sse, s2, s2);
+ sse = vmlal_s16(sse, s3, s3);
+
+ int32x4_t sum_01 = vaddl_s16(s0, s1);
+ int32x4_t sum_23 = vaddl_s16(s2, s3);
+ *sum += horizontal_add_s32x4(vaddq_s32(sum_01, sum_23));
+
+ return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sse));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) };
+
+ int h = 0;
+ do {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ sse[0] = vmlal_s16(sse[0], s0, s0);
+ sse[0] = vmlal_s16(sse[0], s1, s1);
+ sse[1] = vmlal_s16(sse[1], s2, s2);
+ sse[1] = vmlal_s16(sse[1], s3, s3);
+
+ sum_acc[0] = vpadal_s16(sum_acc[0], s0);
+ sum_acc[0] = vpadal_s16(sum_acc[0], s1);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s2);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s3);
+
+ src += 4 * stride;
+ h += 4;
+ } while (h < height);
+
+ *sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1]));
+ return horizontal_long_add_u32x4(
+ vreinterpretq_u32_s32(vaddq_s32(sse[0], sse[1])));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height, int *sum) {
+ uint64x2_t sse = vdupq_n_u64(0);
+ int32x4_t sum_acc = vdupq_n_s32(0);
+
+ int h = 0;
+ do {
+ int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
+ do {
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s3), vget_high_s16(s3));
+
+ sum_acc = vpadalq_s16(sum_acc, s0);
+ sum_acc = vpadalq_s16(sum_acc, s1);
+ sum_acc = vpadalq_s16(sum_acc, s2);
+ sum_acc = vpadalq_s16(sum_acc, s3);
+
+ w += 8;
+ } while (w < width);
+
+ sse = vpadalq_u32(sse,
+ vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1])));
+
+ src += 4 * stride;
+ h += 4;
+ } while (h < height);
+
+ *sum += horizontal_add_s32x4(sum_acc);
+ return horizontal_add_u64x2(sse);
+}
+
+uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int stride, int width,
+ int height, int *sum) {
+ uint64_t sse;
+
+ if (LIKELY(width == 4 && height == 4)) {
+ sse = aom_sum_sse_2d_i16_4x4_neon(src, stride, sum);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ // width = 4, height is a multiple of 4.
+ sse = aom_sum_sse_2d_i16_4xn_neon(src, stride, height, sum);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case - width is multiple of 8, height is multiple of 4.
+ sse = aom_sum_sse_2d_i16_nxn_neon(src, stride, width, height, sum);
+ } else {
+ sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
+ }
+
+ return sse;
+}
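The dispatch above only covers the shapes the NEON kernels handle; anything else drops to aom_sum_sse_2d_i16_c. The quantity being produced is simple: the return value is the sum of squared residuals over the w x h block, and *sum accumulates the plain sum of residuals. A scalar sketch, mirroring the *sum += convention used by the NEON paths above (illustrative only):

    #include <stdint.h>

    // Hypothetical scalar reference for the output of aom_sum_sse_2d_i16_neon.
    static uint64_t sum_sse_2d_i16_ref(const int16_t *src, int stride, int width,
                                       int height, int *sum) {
      uint64_t sse = 0;
      int s = 0;
      for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
          const int d = src[r * stride + c];
          sse += (uint64_t)(d * d);  // squared residual
          s += d;                    // plain residual
        }
      }
      *sum += s;
      return sse;
    }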
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 33784916a..40e40f07b 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -13,639 +13,706 @@
#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"
+#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, uint32_t *sse,
- int *sum) {
- int i, j;
- int16x8_t v_sum = vdupq_n_s16(0);
- int32x4_t v_sse_lo = vdupq_n_s32(0);
- int32x4_t v_sse_hi = vdupq_n_s32(0);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const uint8x8_t v_a = vld1_u8(&a[j]);
- const uint8x8_t v_b = vld1_u8(&b[j]);
- const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
- const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
- v_sum = vaddq_s16(v_sum, sv_diff);
- v_sse_lo =
- vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
- v_sse_hi =
- vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
- }
- a += a_stride;
- b += b_stride;
- }
+#if defined(__ARM_FEATURE_DOTPROD)
- *sum = horizontal_add_s16x8(v_sum);
- *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
+static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = 0;
+ do {
+ uint8x16_t s = load_unaligned_u8q(src, src_stride);
+ uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
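+ // Dot products against a vector of ones act as horizontal adds here: each
+ // 32-bit lane accumulates the sum of a group of four source bytes.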
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 4 * src_stride;
+ ref += 4 * ref_stride;
+ i += 4;
+ } while (i < h);
-void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, unsigned int *sse, int *sum) {
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
}
-void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, unsigned int *sse, int *sum) {
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
+static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = 0;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i += 2;
+ } while (i < h);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
}
-// TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of
-// AVX2.
-void aom_get_sse_sum_8x8_quad_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse, int *sum) {
- // Loop over 4 8x8 blocks. Process one 8x32 block.
- for (int k = 0; k < 4; k++) {
- variance_neon_w8(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8,
- &sse[k], &sum[k]);
- }
+static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ i++;
+ } while (i < h);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
}
-unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - ((sum * sum) >> 6);
+static INLINE void variance_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src + j);
+ uint8x16_t r = vld1q_u8(ref + j);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ j += 16;
+ } while (j < w);
+
+ src += src_stride;
+ ref += ref_stride;
+ i++;
+ } while (i < h);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
}
-unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
- return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8);
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
}
-unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
}
-unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
- variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
- 32, 32, &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
+static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 128, h, sse, sum);
}
-unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
+ assert(h <= 256);
+
+ int i = 0;
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i += 2;
+ } while (i < h);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
}
-unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
-
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
- b_stride, 64, 16, &sse2, &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
- b_stride, 64, 16, &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
+static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128
+ assert(h <= 128);
+
+ int i = 0;
+ do {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ src += src_stride;
+ ref += ref_stride;
+ i++;
+ } while (i < h);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-unsigned int aom_variance128x128_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- sum1 = sse1 = 0;
- for (int i = 0; i < 16; i++) {
- variance_neon_w8(a + (8 * i * a_stride), a_stride, b + (8 * i * b_stride),
- b_stride, 128, 8, &sse2, &sum2);
- sse1 += sse2;
- sum1 += sum2;
- }
+static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
- *sse = sse1;
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
+ assert(h <= 128);
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 14);
+ int i = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src += src_stride;
+ ref += ref_stride;
+ i++;
+ } while (i < h);
+
+ *sum = horizontal_add_s16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride, unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) {
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
+static INLINE void variance_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, int h_limit, uint32_t *sse,
+ int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
+ int i = 0;
+ do {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src + j);
+ uint8x16_t r = vld1q_u8(ref + j);
+
+ int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src += src_stride;
+ ref += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
+ } while (i < h);
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+ *sum = horizontal_add_s32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
- return vget_lane_u32(d0u32, 0);
+static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum);
}
-unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride, unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d2u8, d4u8, d6u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint16x8_t q11u16, q12u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) {
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d2u8, d6u8);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int aom_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
}
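Here shift is log2(w * h), so each instantiation below evaluates the usual identity variance = sse - sum^2 / (w * h) with a power-of-two division. For example, the 16x16 case uses shift 8 because 16 * 16 = 256 = 2^8; a block whose pixels all differ from the reference by the same constant d gives sse = 256 * d^2 and sum = 256 * d, so the returned variance is 256 * d^2 - (256 * d)^2 / 256 = 0, as expected.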
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
+VARIANCE_WXH_NEON(4, 16, 6)
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
+VARIANCE_WXH_NEON(8, 32, 8)
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+VARIANCE_WXH_NEON(16, 4, 6)
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+VARIANCE_WXH_NEON(16, 64, 10)
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+VARIANCE_WXH_NEON(32, 8, 8)
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
- return vget_lane_u32(d0u32, 0);
-}
+VARIANCE_WXH_NEON(64, 16, 10)
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+VARIANCE_WXH_NEON(64, 128, 13)
-unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
- const unsigned char *ref_ptr, int recon_stride,
- unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- int64x1_t d0s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- q7s32 = vdupq_n_s32(0);
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) { // mse16x16_neon_loop
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
- q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
- q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
+VARIANCE_WXH_NEON(128, 64, 13)
+VARIANCE_WXH_NEON(128, 128, 14)
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q10s32 = vaddq_s32(q7s32, q9s32);
+#undef VARIANCE_WXH_NEON
- q1s64 = vpaddlq_s32(q10s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+void aom_get8x8var_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, unsigned int *sse, int *sum) {
+ variance_8xh_neon(src, src_stride, ref, ref_stride, 8, sse, sum);
+}
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+void aom_get16x16var_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, unsigned int *sse,
+ int *sum) {
+ variance_16xh_neon(src, src_stride, ref, ref_stride, 16, sse, sum);
}
-unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride) {
- int16x4_t d22s16, d24s16, d26s16, d28s16;
- int64x1_t d0s64;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d1u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d5u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d3u8 = vld1_u8(src_ptr);
- d7u8 = vld1_u8(ref_ptr);
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d1u8, d5u8);
- q13u16 = vsubl_u8(d2u8, d6u8);
- q14u16 = vsubl_u8(d3u8, d7u8);
-
- d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
- d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
- d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
- d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
- q7s32 = vmull_s16(d22s16, d22s16);
- q8s32 = vmull_s16(d24s16, d24s16);
- q9s32 = vmull_s16(d26s16, d26s16);
- q10s32 = vmull_s16(d28s16, d28s16);
-
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q9s32 = vaddq_s32(q7s32, q9s32);
-
- q1s64 = vpaddlq_s32(q9s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+// TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of
+// AVX2. Also, implement a NEON version of the variance computation done in
+// this function.
+void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ // Loop over 4 8x8 blocks. Process one 8x32 block.
+ for (int k = 0; k < 4; k++) {
+ variance_8xh_neon(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8,
+ &sse8x8[k], &sum8x8[k]);
+ }
+
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++)
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
}
-// Load 4 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
- uint32_t a;
- uint32x4_t a_u32 = vdupq_n_u32(0);
- if (stride == 4) return vld1q_u8(buf);
- memcpy(&a, buf, 4);
- buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 0);
- memcpy(&a, buf, 4);
- buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 1);
- memcpy(&a, buf, 4);
- buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 2);
- memcpy(&a, buf, 4);
- buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 3);
- return vreinterpretq_u8_u32(a_u32);
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = 0;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i += 2;
+ } while (i < h);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return horizontal_add_u32x4(sse_u32);
}
-// The variance helper functions use int16_t for sum. 8 values are accumulated
-// and then added (at which point they expand up to int32_t). To avoid overflow,
-// there can be no more than 32767 / 255 ~= 128 values accumulated in each
-// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
-// rows = 128. Asserts have been added to each function to warn against reaching
-// this limit.
-
-// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int h, uint32_t *sse, int *sum) {
- const int32x4_t zero = vdupq_n_s32(0);
- int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
- int32x4_t sse_s32 = zero;
-
- // Since width is only 4, sum_s16 only loads a half row per loop.
- assert(h <= 256);
+static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
- int i;
- for (i = 0; i < h; i += 4) {
- const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride);
- const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride);
- const int16x8_t diff_lo_s16 =
- vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
- const int16x8_t diff_hi_s16 =
- vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
- sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
-
- sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
-
- a += 4 * a_stride;
- b += 4 * b_stride;
- }
+ int i = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t s1 = vld1q_u8(src + src_stride);
+ uint8x16_t r0 = vld1q_u8(ref);
+ uint8x16_t r1 = vld1q_u8(ref + ref_stride);
- *sum = horizontal_add_s16x8(sum_s16);
- *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+ uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
+ uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
+
+ sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
+ sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i += 2;
+ } while (i < h);
+
+ *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
}
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, uint32_t *sse,
- int *sum) {
- const int32x4_t zero = vdupq_n_s32(0);
- int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
- int32x4_t sse_s32 = zero;
-
- // The loop loads 16 values at a time but doubles them up when accumulating
- // into sum_s16.
- assert(w / 8 * h <= 128);
-
- int i, j;
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a_u8 = vld1q_u8(a + j);
- const uint8x16_t b_u8 = vld1q_u8(b + j);
-
- const int16x8_t diff_lo_s16 =
- vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
- sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
-
- sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
- }
- a += a_stride;
- b += b_stride;
- }
+unsigned int aom_get4x4sse_cs_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint8x16_t s = load_unaligned_u8q(src, src_stride);
+ uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
- *sum = horizontal_add_s16x8(sum_s16);
- *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff);
+
+ return horizontal_add_u32x4(sse);
}
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int h, uint32_t *sse, int *sum) {
- const int32x4_t zero = vdupq_n_s32(0);
- int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
- int32x4_t sse_s32 = zero;
+#else // !defined(__ARM_FEATURE_DOTPROD)
- // Each column has it's own accumulator entry in sum_s16.
- assert(h <= 128);
+static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint8x8_t s[2], r[2];
+ int16x4_t diff_lo[2], diff_hi[2];
+ uint16x8_t diff[2];
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
int i = 0;
do {
- const uint8x8_t a_0_u8 = vld1_u8(a);
- const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
- const uint8x8_t b_0_u8 = vld1_u8(b);
- const uint8x8_t b_1_u8 = vld1_u8(b + b_stride);
- const int16x8_t diff_0_s16 =
- vreinterpretq_s16_u16(vsubl_u8(a_0_u8, b_0_u8));
- const int16x8_t diff_1_s16 =
- vreinterpretq_s16_u16(vsubl_u8(a_1_u8, b_1_u8));
- sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
- sse_s32 =
- vmlal_s16(sse_s32, vget_low_s16(diff_0_s16), vget_low_s16(diff_0_s16));
- sse_s32 =
- vmlal_s16(sse_s32, vget_low_s16(diff_1_s16), vget_low_s16(diff_1_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_0_s16),
- vget_high_s16(diff_0_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_1_s16),
- vget_high_s16(diff_1_s16));
- a += a_stride + a_stride;
- b += b_stride + b_stride;
+ s[0] = vld1_u8(src);
+ src += src_stride;
+ s[1] = vld1_u8(src);
+ src += src_stride;
+ r[0] = vld1_u8(ref);
+ ref += ref_stride;
+ r[1] = vld1_u8(ref);
+ ref += ref_stride;
+
+ diff[0] = vsubl_u8(s[0], r[0]);
+ diff[1] = vsubl_u8(s[1], r[1]);
+
+ diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
+ diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]);
+
+ diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
+ diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]);
+
i += 2;
} while (i < h);
- *sum = horizontal_add_s16x8(sum_s16);
- *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]);
+
+ *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+ return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
}
-#define VARIANCE_NXM(n, m, shift) \
- unsigned int aom_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- unsigned int *sse) { \
- int sum; \
- if (n == 4) \
- variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum); \
- else if (n == 8) \
- variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \
- else \
- variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \
- if (n * m < 16 * 16) \
- return *sse - ((sum * sum) >> shift); \
- else \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
- }
+static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint8x16_t s[2], r[2];
+ int16x4_t diff_lo[4], diff_hi[4];
+ uint16x8_t diff[4];
+ int32x4_t sse_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
-static void variance_neon_wide_block(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride, int w,
- int h, uint32_t *sse, int *sum) {
- const int32x4_t zero = vdupq_n_s32(0);
- int32x4_t v_diff = zero;
- int64x2_t v_sse = vreinterpretq_s64_s32(zero);
-
- int s, i, j;
- for (s = 0; s < 16; s++) {
- int32x4_t sse_s32 = zero;
- int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
- for (i = (s * h) >> 4; i < (((s + 1) * h) >> 4); ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a_u8 = vld1q_u8(a + j);
- const uint8x16_t b_u8 = vld1q_u8(b + j);
-
- const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(
- vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
- sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
- }
-
- a += a_stride;
- b += b_stride;
- }
-
- v_diff = vpadalq_s16(v_diff, sum_s16);
- v_sse = vpadalq_s32(v_sse, sse_s32);
- }
- int diff = horizontal_add_s32x4(v_diff);
-#if defined(__aarch64__)
- uint32_t sq = (uint32_t)vaddvq_u64(vreinterpretq_u64_s64(v_sse));
-#else
- uint32_t sq = vget_lane_u32(
- vreinterpret_u32_s64(vadd_s64(vget_low_s64(v_sse), vget_high_s64(v_sse))),
- 0);
-#endif
-
- *sum = diff;
- *sse = sq;
+ int i = 0;
+ do {
+ s[0] = vld1q_u8(src);
+ src += src_stride;
+ s[1] = vld1q_u8(src);
+ src += src_stride;
+ r[0] = vld1q_u8(ref);
+ ref += ref_stride;
+ r[1] = vld1q_u8(ref);
+ ref += ref_stride;
+
+ diff[0] = vsubl_u8(vget_low_u8(s[0]), vget_low_u8(r[0]));
+ diff[1] = vsubl_u8(vget_high_u8(s[0]), vget_high_u8(r[0]));
+ diff[2] = vsubl_u8(vget_low_u8(s[1]), vget_low_u8(r[1]));
+ diff[3] = vsubl_u8(vget_high_u8(s[1]), vget_high_u8(r[1]));
+
+ diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
+ diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]);
+
+ diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2]));
+ diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3]));
+ sse_s32[2] = vmlal_s16(sse_s32[2], diff_lo[2], diff_lo[2]);
+ sse_s32[3] = vmlal_s16(sse_s32[3], diff_lo[3], diff_lo[3]);
+
+ diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
+ diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]);
+
+ diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2]));
+ diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3]));
+ sse_s32[2] = vmlal_s16(sse_s32[2], diff_hi[2], diff_hi[2]);
+ sse_s32[3] = vmlal_s16(sse_s32[3], diff_hi[3], diff_hi[3]);
+
+ i += 2;
+ } while (i < h);
+
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]);
+ sse_s32[2] = vaddq_s32(sse_s32[2], sse_s32[3]);
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[2]);
+
+ *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+ return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+}
+
+unsigned int aom_get4x4sse_cs_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint8x8_t s[4], r[4];
+ int16x4_t diff[4];
+ int32x4_t sse;
+
+ s[0] = vld1_u8(src);
+ src += src_stride;
+ r[0] = vld1_u8(ref);
+ ref += ref_stride;
+ s[1] = vld1_u8(src);
+ src += src_stride;
+ r[1] = vld1_u8(ref);
+ ref += ref_stride;
+ s[2] = vld1_u8(src);
+ src += src_stride;
+ r[2] = vld1_u8(ref);
+ ref += ref_stride;
+ s[3] = vld1_u8(src);
+ r[3] = vld1_u8(ref);
+
+ diff[0] = vget_low_s16(vreinterpretq_s16_u16(vsubl_u8(s[0], r[0])));
+ diff[1] = vget_low_s16(vreinterpretq_s16_u16(vsubl_u8(s[1], r[1])));
+ diff[2] = vget_low_s16(vreinterpretq_s16_u16(vsubl_u8(s[2], r[2])));
+ diff[3] = vget_low_s16(vreinterpretq_s16_u16(vsubl_u8(s[3], r[3])));
+
+ sse = vmull_s16(diff[0], diff[0]);
+ sse = vmlal_s16(sse, diff[1], diff[1]);
+ sse = vmlal_s16(sse, diff[2], diff[2]);
+ sse = vmlal_s16(sse, diff[3], diff[3]);
+
+ return horizontal_add_u32x4(vreinterpretq_u32_s32(sse));
}
-#define VARIANCE_NXM_WIDE(W, H) \
- unsigned int aom_variance##W##x##H##_neon(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- variance_neon_wide_block(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+#define MSE_WXH_NEON(w, h) \
+ unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h); \
}
-VARIANCE_NXM(4, 4, 4)
-VARIANCE_NXM(4, 8, 5)
-VARIANCE_NXM(8, 4, 5)
-VARIANCE_NXM(16, 32, 9)
-VARIANCE_NXM(32, 16, 9)
-VARIANCE_NXM_WIDE(128, 64)
-VARIANCE_NXM_WIDE(64, 128)
+MSE_WXH_NEON(8, 8)
+MSE_WXH_NEON(8, 16)
+
+MSE_WXH_NEON(16, 8)
+MSE_WXH_NEON(16, 16)
+
+#undef MSE_WXH_NEON
+
+#define COMPUTE_MSE_16BIT(src_16x8, dst_16x8) \
+ /* r7 r6 r5 r4 r3 r2 r1 r0 - 16 bit */ \
+ const uint16x8_t diff = vabdq_u16(src_16x8, dst_16x8); \
+ /*r3 r2 r1 r0 - 16 bit */ \
+ const uint16x4_t res0_low_16x4 = vget_low_u16(diff); \
+ /*r7 r6 r5 r4 - 16 bit */ \
+ const uint16x4_t res0_high_16x4 = vget_high_u16(diff); \
+ /* (r3*r3)= b3 (r2*r2)= b2 (r1*r1)= b1 (r0*r0)= b0 - 32 bit */ \
+ const uint32x4_t res0_32x4 = vmull_u16(res0_low_16x4, res0_low_16x4); \
+ /* (r7*r7)= b7 (r6*r6)= b6 (r5*r5)= b5 (r4*r4)= b4 - 32 bit*/ \
+ /* b3+b7 b2+b6 b1+b5 b0+b4 - 32 bit*/ \
+ const uint32x4_t res_32x4 = \
+ vmlal_u16(res0_32x4, res0_high_16x4, res0_high_16x4); \
+ \
+ /*a1 a0 - 64 bit*/ \
+ const uint64x2_t vl = vpaddlq_u32(res_32x4); \
+ /*a1+a2= f1 a3+a0= f0*/ \
+ square_result = vaddq_u64(square_result, vl);
+
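COMPUTE_MSE_16BIT is the step shared by the two kernels below: it takes eight 16-bit source and destination lanes, squares their absolute differences, and folds the widened results into the running 64-bit square_result accumulator. A scalar sketch of one invocation, with s[] and d[] standing in for the eight lanes (illustrative only):

    // Hypothetical scalar equivalent of one COMPUTE_MSE_16BIT step.
    for (int k = 0; k < 8; ++k) {
      const uint32_t diff = (s[k] > d[k]) ? (uint32_t)(s[k] - d[k])
                                          : (uint32_t)(d[k] - s[k]);
      square_result += (uint64_t)diff * diff;
    }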
+static AOM_INLINE uint64_t mse_4xh_16bit_neon(uint8_t *dst, int dstride,
+ uint16_t *src, int sstride,
+ int h) {
+ uint64x2_t square_result = vdupq_n_u64(0);
+ uint32_t d0, d1;
+ int i = 0;
+ uint8_t *dst_ptr = dst;
+ uint16_t *src_ptr = src;
+ do {
+ // d03 d02 d01 d00 - 8 bit
+ memcpy(&d0, dst_ptr, 4);
+ dst_ptr += dstride;
+ // d13 d12 d11 d10 - 8 bit
+ memcpy(&d1, dst_ptr, 4);
+ dst_ptr += dstride;
+ // duplication
+ uint8x8_t tmp0_8x8 = vreinterpret_u8_u32(vdup_n_u32(d0));
+ // d03 d02 d01 d00 - 16 bit
+ const uint16x4_t dst0_16x4 = vget_low_u16(vmovl_u8(tmp0_8x8));
+ // duplication
+ tmp0_8x8 = vreinterpret_u8_u32(vdup_n_u32(d1));
+ // d13 d12 d11 d10 - 16 bit
+ const uint16x4_t dst1_16x4 = vget_low_u16(vmovl_u8(tmp0_8x8));
+ // d13 d12 d11 d10 d03 d02 d01 d00 - 16 bit
+ const uint16x8_t dst_16x8 = vcombine_u16(dst0_16x4, dst1_16x4);
+
+ // b1r0 - s03 s02 s01 s00 - 16 bit
+ const uint16x4_t src0_16x4 = vld1_u16(src_ptr);
+ src_ptr += sstride;
+ // b1r1 - s13 s12 s11 s10 - 16 bit
+ const uint16x4_t src1_16x4 = vld1_u16(src_ptr);
+ src_ptr += sstride;
+ // s13 s12 s11 s10 s03 s02 s01 s00 - 16 bit
+ const uint16x8_t src_16x8 = vcombine_u16(src0_16x4, src1_16x4);
+
+ COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
+ i += 2;
+ } while (i < h);
+ uint64x1_t sum =
+ vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));
+ return vget_lane_u64(sum, 0);
+}
+
+static AOM_INLINE uint64_t mse_8xh_16bit_neon(uint8_t *dst, int dstride,
+ uint16_t *src, int sstride,
+ int h) {
+ uint64x2_t square_result = vdupq_n_u64(0);
+ int i = 0;
+ do {
+ // d7 d6 d5 d4 d3 d2 d1 d0 - 8 bit
+ const uint16x8_t dst_16x8 = vmovl_u8(vld1_u8(&dst[i * dstride]));
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 16 bit
+ const uint16x8_t src_16x8 = vld1q_u16(&src[i * sstride]);
+
+ COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
+ i++;
+ } while (i < h);
+ uint64x1_t sum =
+ vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));
+ return vget_lane_u64(sum, 0);
+}
+
+// Computes mse for a given block size. This function gets called for specific
+// block sizes, which are 8x8, 8x4, 4x8 and 4x4.
+uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must satisfy");
+ switch (w) {
+ case 4: return mse_4xh_16bit_neon(dst, dstride, src, sstride, h);
+ case 8: return mse_8xh_16bit_neon(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
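
The 16-bit variant measures the squared error between an 8-bit reconstruction (dst) and a 16-bit source (src) and accumulates into 64 bits. A hedged scalar equivalent (helper name illustrative, not part of the patch) would be:

    #include <stdint.h>

    // Scalar reference for aom_mse_wxh_16bit_neon: dst is 8-bit, src is
    // 16-bit, and the squared error is accumulated in a uint64_t.
    static uint64_t mse_wxh_16bit_scalar(const uint8_t *dst, int dstride,
                                         const uint16_t *src, int sstride,
                                         int w, int h) {
      uint64_t sum = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int64_t e =
              (int64_t)src[i * sstride + j] - dst[i * dstride + j];
          sum += (uint64_t)(e * e);
        }
      }
      return sum;
    }
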
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 1e48bc1e7..ceb102679 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -142,7 +142,12 @@ void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride,
++tmp_buf;
}
- for (idx = 0; idx < 16; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+ // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_4x4_sse2).
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ coeff[i * 4 + j] = (tran_low_t)buffer2[j * 4 + i];
+ }
+ }
}
// src_diff: first pass, 9 bit, dynamic range [-255, 255]
@@ -177,8 +182,6 @@ static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
coeff[5] = c3 - c7;
}
-// The order of the output coeff of the hadamard is not important. For
-// optimization purposes the final transpose may be skipped.
void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int idx;
@@ -201,7 +204,12 @@ void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
++tmp_buf;
}
- for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+ // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_8x8_sse2).
+ for (int i = 0; i < 8; i++) {
+ for (int j = 0; j < 8; j++) {
+ coeff[i * 8 + j] = (tran_low_t)buffer2[j * 8 + i];
+ }
+ }
}
void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
@@ -226,6 +234,13 @@ void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
}
for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx];
+
+ // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_lp_8x8_sse2).
+ for (int i = 0; i < 8; i++) {
+ for (int j = 0; j < 8; j++) {
+ coeff[i * 8 + j] = buffer2[j * 8 + i];
+ }
+ }
}
void aom_hadamard_lp_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride,
@@ -266,6 +281,17 @@ void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
++coeff;
}
+
+ coeff -= 64;
+ // Extra shift to match AVX2 output (i.e., aom_hadamard_16x16_avx2).
+ // Note that this step is not needed to match the SSE2 output.
+ for (int i = 0; i < 16; i++) {
+ for (int j = 0; j < 4; j++) {
+ tran_low_t temp = coeff[i * 16 + 4 + j];
+ coeff[i * 16 + 4 + j] = coeff[i * 16 + 8 + j];
+ coeff[i * 16 + 8 + j] = temp;
+ }
+ }
}
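
In other words, within each group of 16 consecutive output coefficients the two middle blocks of four are exchanged, reflecting how the AVX2 ordering differs from the SSE2 one here:

    [c0..c3 | c4..c7 | c8..c11 | c12..c15]  ->  [c0..c3 | c8..c11 | c4..c7 | c12..c15]
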
void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
@@ -507,35 +533,35 @@ int aom_satd_lp_c(const int16_t *coeff, int length) {
// Integer projection onto row vectors.
// height: value range {16, 32, 64, 128}.
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
- const int ref_stride, const int height) {
- int idx;
- const int norm_factor = height >> 1;
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
+ const int width, const int height, int norm_factor) {
assert(height >= 2);
- for (idx = 0; idx < 16; ++idx) {
- int i;
+ for (int idx = 0; idx < width; ++idx) {
hbuf[idx] = 0;
// hbuf[idx]: 14 bit, dynamic range [0, 32640].
- for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+ for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
// hbuf[idx]: 9 bit, dynamic range [0, 1020].
- hbuf[idx] /= norm_factor;
+ hbuf[idx] >>= norm_factor;
++ref;
}
}
// width: value range {16, 32, 64, 128}.
-int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
- int idx;
- int16_t sum = 0;
- // sum: 14 bit, dynamic range [0, 32640]
- for (idx = 0; idx < width; ++idx) sum += ref[idx];
- return sum;
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride,
+ const int width, const int height, int norm_factor) {
+ for (int ht = 0; ht < height; ++ht) {
+ int16_t sum = 0;
+ // sum: 14 bit, dynamic range [0, 32640]
+ for (int idx = 0; idx < width; ++idx) sum += ref[idx];
+ vbuf[ht] = sum >> norm_factor;
+ ref += ref_stride;
+ }
}
// ref: [0 - 510]
// src: [0 - 510]
// bwl: {2, 3, 4, 5}
-int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
int i;
int width = 4 << bwl;
int sse = 0, mean = 0, var;
@@ -547,6 +573,9 @@ int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
}
// (mean * mean): dynamic range 31 bits.
- var = sse - ((mean * mean) >> (bwl + 2));
+ // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~=
+ // 31.99, so it needs to be cast to unsigned int to compute its square.
+ const unsigned int mean_abs = abs(mean);
+ var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
return var;
}
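
The cast matters because, per the value ranges noted above, `mean` can reach 510 * 128 = 65280 when bwl == 5, and its square does not fit in a signed 32-bit int. A small stand-alone illustration (assuming 32-bit int):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      const int bwl = 5;           // width = 4 << 5 = 128
      const int mean = 510 * 128;  // 65280, the largest possible sum
      // 65280 * 65280 = 4261478400 overflows a signed 32-bit int;
      // squaring the absolute value as unsigned is well defined.
      const unsigned int mean_abs = abs(mean);
      printf("%u\n", (mean_abs * mean_abs) >> (bwl + 2));  // prints 33292800
      return 0;
    }
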
diff --git a/av1/encoder/corner_detect.c b/aom_dsp/flow_estimation/corner_detect.c
index 597bb30fc..c49e3fa75 100644
--- a/av1/encoder/corner_detect.c
+++ b/aom_dsp/flow_estimation/corner_detect.c
@@ -17,7 +17,7 @@
#include "third_party/fastfeat/fast.h"
-#include "av1/encoder/corner_detect.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
// Fast_9 wrapper
#define FAST_BARRIER 18
diff --git a/av1/encoder/corner_detect.h b/aom_dsp/flow_estimation/corner_detect.h
index 15062f265..4481c4eb9 100644
--- a/av1/encoder/corner_detect.h
+++ b/aom_dsp/flow_estimation/corner_detect.h
@@ -9,14 +9,22 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
-#define AOM_AV1_ENCODER_CORNER_DETECT_H_
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
int av1_fast_corner_detect(unsigned char *buf, int width, int height,
int stride, int *points, int max_points);
-#endif // AOM_AV1_ENCODER_CORNER_DETECT_H_
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
diff --git a/av1/encoder/corner_match.c b/aom_dsp/flow_estimation/corner_match.c
index 3631be901..f67560436 100644
--- a/av1/encoder/corner_match.c
+++ b/aom_dsp/flow_estimation/corner_match.c
@@ -13,9 +13,13 @@
#include <memory.h>
#include <math.h>
-#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
-#include "av1/encoder/corner_match.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/flow_estimation/ransac.h"
+#include "aom_scale/yv12config.h"
#define SEARCH_SZ 9
#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
@@ -139,7 +143,7 @@ static void improve_correspondence(unsigned char *frm, unsigned char *ref,
}
}
-int av1_determine_correspondence(unsigned char *src, int *src_corners,
+int aom_determine_correspondence(unsigned char *src, int *src_corners,
int num_src_corners, unsigned char *ref,
int *ref_corners, int num_ref_corners,
int width, int height, int src_stride,
@@ -190,3 +194,72 @@ int av1_determine_correspondence(unsigned char *src, int *src_corners,
correspondences, num_correspondences);
return num_correspondences;
}
+
+static bool get_inliers_from_indices(MotionModel *params,
+ int *correspondences) {
+ int *inliers_tmp = (int *)aom_calloc(2 * MAX_CORNERS, sizeof(*inliers_tmp));
+ if (!inliers_tmp) return false;
+
+ for (int i = 0; i < params->num_inliers; i++) {
+ int index = params->inliers[i];
+ inliers_tmp[2 * i] = correspondences[4 * index];
+ inliers_tmp[2 * i + 1] = correspondences[4 * index + 1];
+ }
+ memcpy(params->inliers, inliers_tmp, sizeof(*inliers_tmp) * 2 * MAX_CORNERS);
+ aom_free(inliers_tmp);
+ return true;
+}
+
+int av1_compute_global_motion_feature_based(
+ TransformationType type, unsigned char *src_buffer, int src_width,
+ int src_height, int src_stride, int *src_corners, int num_src_corners,
+ YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+ MotionModel *params_by_motion, int num_motions) {
+ int i;
+ int num_ref_corners;
+ int num_correspondences;
+ int *correspondences;
+ int ref_corners[2 * MAX_CORNERS];
+ unsigned char *ref_buffer = ref->y_buffer;
+ RansacFunc ransac = av1_get_ransac_type(type);
+
+ if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+ ref_buffer = av1_downconvert_frame(ref, bit_depth);
+ }
+
+ num_ref_corners =
+ av1_fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+ ref->y_stride, ref_corners, MAX_CORNERS);
+
+ // find correspondences between the two images
+ correspondences =
+ (int *)aom_malloc(num_src_corners * 4 * sizeof(*correspondences));
+ if (!correspondences) return 0;
+ num_correspondences = aom_determine_correspondence(
+ src_buffer, (int *)src_corners, num_src_corners, ref_buffer,
+ (int *)ref_corners, num_ref_corners, src_width, src_height, src_stride,
+ ref->y_stride, correspondences);
+
+ ransac(correspondences, num_correspondences, num_inliers_by_motion,
+ params_by_motion, num_motions);
+
+ // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+ for (i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences ||
+ num_correspondences == 0) {
+ num_inliers_by_motion[i] = 0;
+ } else if (!get_inliers_from_indices(&params_by_motion[i],
+ correspondences)) {
+ aom_free(correspondences);
+ return 0;
+ }
+ }
+
+ aom_free(correspondences);
+
+ // Return true if any one of the motions has inliers.
+ for (i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] > 0) return 1;
+ }
+ return 0;
+}
diff --git a/av1/encoder/corner_match.h b/aom_dsp/flow_estimation/corner_match.h
index 45c90f32f..71afadf7b 100644
--- a/av1/encoder/corner_match.h
+++ b/aom_dsp/flow_estimation/corner_match.h
@@ -8,13 +8,21 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
-#define AOM_AV1_ENCODER_CORNER_MATCH_H_
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#define MATCH_SZ 13
#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
@@ -24,10 +32,20 @@ typedef struct {
int rx, ry;
} Correspondence;
-int av1_determine_correspondence(unsigned char *src, int *src_corners,
+int aom_determine_correspondence(unsigned char *src, int *src_corners,
int num_src_corners, unsigned char *ref,
int *ref_corners, int num_ref_corners,
int width, int height, int src_stride,
int ref_stride, int *correspondence_pts);
-#endif // AOM_AV1_ENCODER_CORNER_MATCH_H_
+int av1_compute_global_motion_feature_based(
+ TransformationType type, unsigned char *src_buffer, int src_width,
+ int src_height, int src_stride, int *src_corners, int num_src_corners,
+ YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+ MotionModel *params_by_motion, int num_motions);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
diff --git a/aom_dsp/flow_estimation/disflow.c b/aom_dsp/flow_estimation/disflow.c
new file mode 100644
index 000000000..2a6ad4b67
--- /dev/null
+++ b/aom_dsp/flow_estimation/disflow.c
@@ -0,0 +1,634 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/flow_estimation/ransac.h"
+
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/resize.h"
+
+// Number of pyramid levels in disflow computation
+#define N_LEVELS 2
+// Size of square patches in the disflow dense grid
+#define PATCH_SIZE 8
+// Center point of square patch
+#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1)
+// Step size between patches, lower value means greater patch overlap
+#define PATCH_STEP 1
+// Minimum size of border padding for disflow
+#define MIN_PAD 7
+// Warp error convergence threshold for disflow
+#define DISFLOW_ERROR_TR 0.01
+// Max number of iterations if warp convergence is not found
+#define DISFLOW_MAX_ITR 10
+
+// Struct for an image pyramid
+typedef struct {
+ int n_levels;
+ int pad_size;
+ int has_gradient;
+ int widths[N_LEVELS];
+ int heights[N_LEVELS];
+ int strides[N_LEVELS];
+ int level_loc[N_LEVELS];
+ unsigned char *level_buffer;
+ double *level_dx_buffer;
+ double *level_dy_buffer;
+} ImagePyramid;
+
+// Don't use points around the frame border since they are less reliable
+static INLINE int valid_point(int x, int y, int width, int height) {
+ return (x > (PATCH_SIZE + PATCH_CENTER)) &&
+ (x < (width - PATCH_SIZE - PATCH_CENTER)) &&
+ (y > (PATCH_SIZE + PATCH_CENTER)) &&
+ (y < (height - PATCH_SIZE - PATCH_CENTER));
+}
+
+static int determine_disflow_correspondence(int *frm_corners,
+ int num_frm_corners, double *flow_u,
+ double *flow_v, int width,
+ int height, int stride,
+ double *correspondences) {
+ int num_correspondences = 0;
+ int x, y;
+ for (int i = 0; i < num_frm_corners; ++i) {
+ x = frm_corners[2 * i];
+ y = frm_corners[2 * i + 1];
+ if (valid_point(x, y, width, height)) {
+ correspondences[4 * num_correspondences] = x;
+ correspondences[4 * num_correspondences + 1] = y;
+ correspondences[4 * num_correspondences + 2] = x + flow_u[y * stride + x];
+ correspondences[4 * num_correspondences + 3] = y + flow_v[y * stride + x];
+ num_correspondences++;
+ }
+ }
+ return num_correspondences;
+}
+
+static double getCubicValue(double p[4], double x) {
+ return p[1] + 0.5 * x *
+ (p[2] - p[0] +
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+}
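
getCubicValue() evaluates the standard Catmull-Rom cubic through four consecutive samples p_0..p_3 at a fractional offset x in [0, 1]:

    p(x) = p_1 + \tfrac{x}{2}\Big[(p_2 - p_0)
             + x\big((2p_0 - 5p_1 + 4p_2 - p_3)
             + x\,(3(p_1 - p_2) + p_3 - p_0)\big)\Big]
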
+
+static void get_subcolumn(unsigned char *ref, double col[4], int stride, int x,
+ int y_start) {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ col[i] = ref[(i + y_start) * stride + x];
+ }
+}
+
+static double bicubic(unsigned char *ref, double x, double y, int stride) {
+ double arr[4];
+ int k;
+ int i = (int)x;
+ int j = (int)y;
+ for (k = 0; k < 4; ++k) {
+ double arr_temp[4];
+ get_subcolumn(ref, arr_temp, stride, i + k - 1, j - 1);
+ arr[k] = getCubicValue(arr_temp, y - j);
+ }
+ return getCubicValue(arr, x - i);
+}
+
+// Interpolate a warped block using bicubic interpolation when possible
+static unsigned char interpolate(unsigned char *ref, double x, double y,
+ int width, int height, int stride) {
+ if (x < 0 && y < 0)
+ return ref[0];
+ else if (x < 0 && y > height - 1)
+ return ref[(height - 1) * stride];
+ else if (x > width - 1 && y < 0)
+ return ref[width - 1];
+ else if (x > width - 1 && y > height - 1)
+ return ref[(height - 1) * stride + (width - 1)];
+ else if (x < 0) {
+ int v;
+ int i = (int)y;
+ double a = y - i;
+ if (y > 1 && y < height - 2) {
+ double arr[4];
+ get_subcolumn(ref, arr, stride, 0, i - 1);
+ return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255);
+ }
+ v = (int)(ref[i * stride] * (1 - a) + ref[(i + 1) * stride] * a + 0.5);
+ return clamp(v, 0, 255);
+ } else if (y < 0) {
+ int v;
+ int j = (int)x;
+ double b = x - j;
+ if (x > 1 && x < width - 2) {
+ double arr[4] = { ref[j - 1], ref[j], ref[j + 1], ref[j + 2] };
+ return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255);
+ }
+ v = (int)(ref[j] * (1 - b) + ref[j + 1] * b + 0.5);
+ return clamp(v, 0, 255);
+ } else if (x > width - 1) {
+ int v;
+ int i = (int)y;
+ double a = y - i;
+ if (y > 1 && y < height - 2) {
+ double arr[4];
+ get_subcolumn(ref, arr, stride, width - 1, i - 1);
+ return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255);
+ }
+ v = (int)(ref[i * stride + width - 1] * (1 - a) +
+ ref[(i + 1) * stride + width - 1] * a + 0.5);
+ return clamp(v, 0, 255);
+ } else if (y > height - 1) {
+ int v;
+ int j = (int)x;
+ double b = x - j;
+ if (x > 1 && x < width - 2) {
+ int row = (height - 1) * stride;
+ double arr[4] = { ref[row + j - 1], ref[row + j], ref[row + j + 1],
+ ref[row + j + 2] };
+ return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255);
+ }
+ v = (int)(ref[(height - 1) * stride + j] * (1 - b) +
+ ref[(height - 1) * stride + j + 1] * b + 0.5);
+ return clamp(v, 0, 255);
+ } else if (x > 1 && y > 1 && x < width - 2 && y < height - 2) {
+ return clamp((int)(bicubic(ref, x, y, stride) + 0.5), 0, 255);
+ } else {
+ int i = (int)y;
+ int j = (int)x;
+ double a = y - i;
+ double b = x - j;
+ int v = (int)(ref[i * stride + j] * (1 - a) * (1 - b) +
+ ref[i * stride + j + 1] * (1 - a) * b +
+ ref[(i + 1) * stride + j] * a * (1 - b) +
+ ref[(i + 1) * stride + j + 1] * a * b);
+ return clamp(v, 0, 255);
+ }
+}
+
+// Warps a block using flow vector [u, v] and computes the mse
+static double compute_warp_and_error(unsigned char *ref, unsigned char *frm,
+ int width, int height, int stride, int x,
+ int y, double u, double v, int16_t *dt) {
+ int i, j;
+ unsigned char warped;
+ double x_w, y_w;
+ double mse = 0;
+ int16_t err = 0;
+ for (i = y; i < y + PATCH_SIZE; ++i)
+ for (j = x; j < x + PATCH_SIZE; ++j) {
+ x_w = (double)j + u;
+ y_w = (double)i + v;
+ warped = interpolate(ref, x_w, y_w, width, height, stride);
+ err = warped - frm[j + i * stride];
+ mse += err * err;
+ dt[(i - y) * PATCH_SIZE + (j - x)] = err;
+ }
+
+ mse /= (PATCH_SIZE * PATCH_SIZE);
+ return mse;
+}
+
+// Computes the components of the system of equations used to solve for
+// a flow vector. This includes:
+// 1.) The hessian matrix for optical flow. This matrix is in the
+// form of:
+//
+// M = |sum(dx * dx) sum(dx * dy)|
+// |sum(dx * dy) sum(dy * dy)|
+//
+// 2.) b = |sum(dx * dt)|
+// |sum(dy * dt)|
+// Where the sums are computed over a square window of PATCH_SIZE.
+static INLINE void compute_flow_system(const double *dx, int dx_stride,
+ const double *dy, int dy_stride,
+ const int16_t *dt, int dt_stride,
+ double *M, double *b) {
+ for (int i = 0; i < PATCH_SIZE; i++) {
+ for (int j = 0; j < PATCH_SIZE; j++) {
+ M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
+ M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
+ M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
+
+ b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
+ b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
+ }
+ }
+
+ M[2] = M[1];
+}
+
+// Solves a general Mx = b where M is a 2x2 matrix and b is a 2x1 matrix
+static INLINE void solve_2x2_system(const double *M, const double *b,
+ double *output_vec) {
+ double M_0 = M[0];
+ double M_3 = M[3];
+ double det = (M_0 * M_3) - (M[1] * M[2]);
+ if (det < 1e-5) {
+ // Handle singular matrix
+ // TODO(sarahparker) compare results using pseudo inverse instead
+ M_0 += 1e-10;
+ M_3 += 1e-10;
+ det = (M_0 * M_3) - (M[1] * M[2]);
+ }
+ const double det_inv = 1 / det;
+ const double mult_b0 = det_inv * b[0];
+ const double mult_b1 = det_inv * b[1];
+ output_vec[0] = M_3 * mult_b0 - M[1] * mult_b1;
+ output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1;
+}
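
Taken together, compute_flow_system() and solve_2x2_system() perform one Gauss-Newton-style refinement of the flow vector: with M and b as defined above, the increment applied to (u, v) is

    \begin{pmatrix} \Delta u \\ \Delta v \end{pmatrix} = M^{-1} b
      = \frac{1}{M_{00} M_{11} - M_{01} M_{10}}
        \begin{pmatrix} M_{11} & -M_{01} \\ -M_{10} & M_{00} \end{pmatrix} b,

which is the expression the solver expands directly, with a tiny value added to the diagonal when the determinant is close to zero.
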
+
+/*
+static INLINE void image_difference(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int16_t *dst, int dst_stride, int height,
+ int width) {
+ const int block_unit = 8;
+ // Take difference in 8x8 blocks to make use of optimized diff function
+ for (int i = 0; i < height; i += block_unit) {
+ for (int j = 0; j < width; j += block_unit) {
+ aom_subtract_block(block_unit, block_unit, dst + i * dst_stride + j,
+ dst_stride, src + i * src_stride + j, src_stride,
+ ref + i * ref_stride + j, ref_stride);
+ }
+ }
+}
+*/
+
+static INLINE void convolve_2d_sobel_y(const uint8_t *src, int src_stride,
+ double *dst, int dst_stride, int w,
+ int h, int dir, double norm) {
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
+ DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
+ const int taps = 3;
+ int im_h = h + taps - 1;
+ int im_stride = w;
+ const int fo_vert = 1;
+ const int fo_horiz = 1;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = dir ? sobel_a : sobel_b;
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int16_t sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ im_block[y * im_stride + x] = sum;
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = dir ? sobel_b : sobel_a;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int16_t sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ dst[y * dst_stride + x] = sum * norm;
+ }
+ }
+}
+
+// Compute an image gradient using a sobel filter.
+// If dir == 1, compute the x gradient. If dir == 0, compute y. This function
+// assumes the images have been padded so that they can be processed in units
+// of 8.
+static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride,
+ double *dst, int dst_stride,
+ int height, int width, int dir) {
+ double norm = 1.0;
+ // TODO(sarahparker) experiment with doing this over larger block sizes
+ const int block_unit = 8;
+ // Filter in 8x8 blocks to eventually make use of optimized convolve function
+ for (int i = 0; i < height; i += block_unit) {
+ for (int j = 0; j < width; j += block_unit) {
+ convolve_2d_sobel_y(src + i * src_stride + j, src_stride,
+ dst + i * dst_stride + j, dst_stride, block_unit,
+ block_unit, dir, norm);
+ }
+ }
+}
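
The two 3-tap filters are the separable factors of the 3x3 Sobel kernels: for dir == 1 the horizontal pass applies [1 0 -1] and the vertical pass [1 2 1], i.e.

    G_x = \begin{pmatrix} 1 \\ 2 \\ 1 \end{pmatrix}
          \begin{pmatrix} 1 & 0 & -1 \end{pmatrix}
        = \begin{pmatrix} 1 & 0 & -1 \\ 2 & 0 & -2 \\ 1 & 0 & -1 \end{pmatrix},

and for dir == 0 the roles are swapped, giving the transposed kernel G_y; both are scaled by `norm`.
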
+
+static void free_pyramid(ImagePyramid *pyr) {
+ aom_free(pyr->level_buffer);
+ if (pyr->has_gradient) {
+ aom_free(pyr->level_dx_buffer);
+ aom_free(pyr->level_dy_buffer);
+ }
+ aom_free(pyr);
+}
+
+static ImagePyramid *alloc_pyramid(int width, int height, int pad_size,
+ int compute_gradient) {
+ ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr));
+ if (!pyr) return NULL;
+ pyr->has_gradient = compute_gradient;
+ // 2 * width * height is the upper bound for a buffer that fits
+ // all pyramid levels + padding for each level
+ const int buffer_size = sizeof(*pyr->level_buffer) * 2 * width * height +
+ (width + 2 * pad_size) * 2 * pad_size * N_LEVELS;
+ pyr->level_buffer = aom_malloc(buffer_size);
+ if (!pyr->level_buffer) {
+ free_pyramid(pyr);
+ return NULL;
+ }
+ memset(pyr->level_buffer, 0, buffer_size);
+
+ if (compute_gradient) {
+ const int gradient_size =
+ sizeof(*pyr->level_dx_buffer) * 2 * width * height +
+ (width + 2 * pad_size) * 2 * pad_size * N_LEVELS;
+ pyr->level_dx_buffer = aom_calloc(1, gradient_size);
+ pyr->level_dy_buffer = aom_calloc(1, gradient_size);
+ if (!(pyr->level_dx_buffer && pyr->level_dy_buffer)) {
+ free_pyramid(pyr);
+ return NULL;
+ }
+ }
+ return pyr;
+}
+
+static INLINE void update_level_dims(ImagePyramid *frm_pyr, int level) {
+ frm_pyr->widths[level] = frm_pyr->widths[level - 1] >> 1;
+ frm_pyr->heights[level] = frm_pyr->heights[level - 1] >> 1;
+ frm_pyr->strides[level] = frm_pyr->widths[level] + 2 * frm_pyr->pad_size;
+ // Point the beginning of the next level buffer to the correct location inside
+ // the padded border
+ frm_pyr->level_loc[level] =
+ frm_pyr->level_loc[level - 1] +
+ frm_pyr->strides[level - 1] *
+ (2 * frm_pyr->pad_size + frm_pyr->heights[level - 1]);
+}
+
+// Compute coarse to fine pyramids for a frame
+static void compute_flow_pyramids(unsigned char *frm, const int frm_width,
+ const int frm_height, const int frm_stride,
+ int n_levels, int pad_size, int compute_grad,
+ ImagePyramid *frm_pyr) {
+ int cur_width, cur_height, cur_stride, cur_loc;
+ assert((frm_width >> n_levels) > 0);
+ assert((frm_height >> n_levels) > 0);
+
+ // Initialize first level
+ frm_pyr->n_levels = n_levels;
+ frm_pyr->pad_size = pad_size;
+ frm_pyr->widths[0] = frm_width;
+ frm_pyr->heights[0] = frm_height;
+ frm_pyr->strides[0] = frm_width + 2 * frm_pyr->pad_size;
+ // Point the beginning of the level buffer to the location inside
+ // the padded border
+ frm_pyr->level_loc[0] =
+ frm_pyr->strides[0] * frm_pyr->pad_size + frm_pyr->pad_size;
+ // This essentially copies the original buffer into the pyramid buffer
+ // without the original padding
+ av1_resize_plane(frm, frm_height, frm_width, frm_stride,
+ frm_pyr->level_buffer + frm_pyr->level_loc[0],
+ frm_pyr->heights[0], frm_pyr->widths[0],
+ frm_pyr->strides[0]);
+
+ if (compute_grad) {
+ cur_width = frm_pyr->widths[0];
+ cur_height = frm_pyr->heights[0];
+ cur_stride = frm_pyr->strides[0];
+ cur_loc = frm_pyr->level_loc[0];
+ assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+ frm_pyr->level_dy_buffer != NULL);
+ // Compute the x gradient
+ sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+ frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+ cur_height, cur_width, 1);
+
+ // Compute the y gradient
+ sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+ frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+ cur_height, cur_width, 0);
+ }
+
+ // Start at the finest level and resize down to the coarsest level
+ for (int level = 1; level < n_levels; ++level) {
+ update_level_dims(frm_pyr, level);
+ cur_width = frm_pyr->widths[level];
+ cur_height = frm_pyr->heights[level];
+ cur_stride = frm_pyr->strides[level];
+ cur_loc = frm_pyr->level_loc[level];
+
+ av1_resize_plane(frm_pyr->level_buffer + frm_pyr->level_loc[level - 1],
+ frm_pyr->heights[level - 1], frm_pyr->widths[level - 1],
+ frm_pyr->strides[level - 1],
+ frm_pyr->level_buffer + cur_loc, cur_height, cur_width,
+ cur_stride);
+
+ if (compute_grad) {
+ assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+ frm_pyr->level_dy_buffer != NULL);
+ // Compute the x gradient
+ sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+ frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+ cur_height, cur_width, 1);
+
+ // Compute the y gradient
+ sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+ frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+ cur_height, cur_width, 0);
+ }
+ }
+}
+
+static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
+ double *dx, double *dy, int x, int y,
+ int width, int height, int stride,
+ double *u, double *v) {
+ double M[4] = { 0 };
+ double b[2] = { 0 };
+ double tmp_output_vec[2] = { 0 };
+ double error = 0;
+ int16_t dt[PATCH_SIZE * PATCH_SIZE];
+ double o_u = *u;
+ double o_v = *v;
+
+ for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+ error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
+ *v, dt);
+ if (error <= DISFLOW_ERROR_TR) break;
+ compute_flow_system(dx, stride, dy, stride, dt, PATCH_SIZE, M, b);
+ solve_2x2_system(M, b, tmp_output_vec);
+ *u += tmp_output_vec[0];
+ *v += tmp_output_vec[1];
+ }
+ if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_v) > PATCH_SIZE) {
+ *u = o_u;
+ *v = o_v;
+ }
+}
+
+// make sure flow_u and flow_v start at 0
+static bool compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
+ double *flow_u, double *flow_v) {
+ int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
+ double *u_upscale =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ double *v_upscale =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+ if (!(u_upscale && v_upscale)) {
+ aom_free(u_upscale);
+ aom_free(v_upscale);
+ return false;
+ }
+
+ assert(frm_pyr->n_levels == ref_pyr->n_levels);
+
+ // Compute flow field from coarsest to finest level of the pyramid
+ for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
+ cur_width = frm_pyr->widths[level];
+ cur_height = frm_pyr->heights[level];
+ cur_stride = frm_pyr->strides[level];
+ cur_loc = frm_pyr->level_loc[level];
+
+ for (int i = PATCH_SIZE; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
+ for (int j = PATCH_SIZE; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
+ patch_loc = i * cur_stride + j;
+ patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
+ compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
+ ref_pyr->level_buffer + cur_loc,
+ frm_pyr->level_dx_buffer + cur_loc + patch_loc,
+ frm_pyr->level_dy_buffer + cur_loc + patch_loc, j,
+ i, cur_width, cur_height, cur_stride,
+ flow_u + patch_center, flow_v + patch_center);
+ }
+ }
+ // TODO(sarahparker) Replace this with upscale function in resize.c
+ if (level > 0) {
+ int h_upscale = frm_pyr->heights[level - 1];
+ int w_upscale = frm_pyr->widths[level - 1];
+ int s_upscale = frm_pyr->strides[level - 1];
+ for (int i = 0; i < h_upscale; ++i) {
+ for (int j = 0; j < w_upscale; ++j) {
+ u_upscale[j + i * s_upscale] =
+ flow_u[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
+ v_upscale[j + i * s_upscale] =
+ flow_v[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
+ }
+ }
+ memcpy(flow_u, u_upscale,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ memcpy(flow_v, v_upscale,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+ }
+ }
+ aom_free(u_upscale);
+ aom_free(v_upscale);
+ return true;
+}
+
+int av1_compute_global_motion_disflow_based(
+ TransformationType type, unsigned char *frm_buffer, int frm_width,
+ int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+ YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+ MotionModel *params_by_motion, int num_motions) {
+ unsigned char *ref_buffer = ref->y_buffer;
+ const int ref_width = ref->y_width;
+ const int ref_height = ref->y_height;
+ const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD);
+ int num_correspondences;
+ double *correspondences;
+ RansacFuncDouble ransac = av1_get_ransac_double_prec_type(type);
+ assert(frm_width == ref_width);
+ assert(frm_height == ref_height);
+
+ // Ensure the number of pyramid levels will work with the frame resolution
+ const int msb =
+ frm_width < frm_height ? get_msb(frm_width) : get_msb(frm_height);
+ const int n_levels = AOMMIN(msb, N_LEVELS);
+
+ if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+ ref_buffer = av1_downconvert_frame(ref, bit_depth);
+ }
+
+ // TODO(sarahparker) We will want to do the source pyramid computation
+ // outside of this function so it doesn't get recomputed for every
+ // reference. We also don't need to compute every pyramid level for the
+ // reference in advance, since lower levels can be overwritten once their
+ // flow field is computed and upscaled. I'll add these optimizations
+ // once the full implementation is working.
+ // Allocate frm image pyramids
+ int compute_gradient = 1;
+ ImagePyramid *frm_pyr =
+ alloc_pyramid(frm_width, frm_height, pad_size, compute_gradient);
+ if (!frm_pyr) return 0;
+ compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm_stride, n_levels,
+ pad_size, compute_gradient, frm_pyr);
+ // Allocate ref image pyramids
+ compute_gradient = 0;
+ ImagePyramid *ref_pyr =
+ alloc_pyramid(ref_width, ref_height, pad_size, compute_gradient);
+ if (!ref_pyr) {
+ free_pyramid(frm_pyr);
+ return 0;
+ }
+ compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride,
+ n_levels, pad_size, compute_gradient, ref_pyr);
+
+ int ret = 0;
+ double *flow_u =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ double *flow_v =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+ if (!(flow_u && flow_v)) goto Error;
+
+ memset(flow_u, 0,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ memset(flow_v, 0,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+ if (!compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v)) goto Error;
+
+ // find correspondences between the two images using the flow field
+ correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences));
+ if (!correspondences) goto Error;
+ num_correspondences = determine_disflow_correspondence(
+ frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height,
+ frm_pyr->strides[0], correspondences);
+ ransac(correspondences, num_correspondences, num_inliers_by_motion,
+ params_by_motion, num_motions);
+
+ // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+ for (int i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+ num_inliers_by_motion[i] = 0;
+ }
+ }
+
+ // Return true if any one of the motions has inliers.
+ for (int i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] > 0) {
+ ret = 1;
+ break;
+ }
+ }
+
+ aom_free(correspondences);
+Error:
+ free_pyramid(frm_pyr);
+ free_pyramid(ref_pyr);
+ aom_free(flow_u);
+ aom_free(flow_v);
+ return ret;
+}
diff --git a/aom_dsp/flow_estimation/disflow.h b/aom_dsp/flow_estimation/disflow.h
new file mode 100644
index 000000000..52fb261a4
--- /dev/null
+++ b/aom_dsp/flow_estimation/disflow.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
+
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_compute_global_motion_disflow_based(
+ TransformationType type, unsigned char *frm_buffer, int frm_width,
+ int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+ YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+ MotionModel *params_by_motion, int num_motions);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
diff --git a/aom_dsp/flow_estimation/flow_estimation.c b/aom_dsp/flow_estimation/flow_estimation.c
new file mode 100644
index 000000000..d8cf8bdbc
--- /dev/null
+++ b/aom_dsp/flow_estimation/flow_estimation.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_dsp/flow_estimation/corner_match.h"
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+int aom_compute_global_motion(TransformationType type,
+ unsigned char *src_buffer, int src_width,
+ int src_height, int src_stride, int *src_corners,
+ int num_src_corners, YV12_BUFFER_CONFIG *ref,
+ int bit_depth,
+ GlobalMotionEstimationType gm_estimation_type,
+ int *num_inliers_by_motion,
+ MotionModel *params_by_motion, int num_motions) {
+ switch (gm_estimation_type) {
+ case GLOBAL_MOTION_FEATURE_BASED:
+ return av1_compute_global_motion_feature_based(
+ type, src_buffer, src_width, src_height, src_stride, src_corners,
+ num_src_corners, ref, bit_depth, num_inliers_by_motion,
+ params_by_motion, num_motions);
+ case GLOBAL_MOTION_DISFLOW_BASED:
+ return av1_compute_global_motion_disflow_based(
+ type, src_buffer, src_width, src_height, src_stride, src_corners,
+ num_src_corners, ref, bit_depth, num_inliers_by_motion,
+ params_by_motion, num_motions);
+ default: assert(0 && "Unknown global motion estimation type");
+ }
+ return 0;
+}
+
+unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) {
+ int i, j;
+ uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer);
+ uint8_t *buf_8bit = frm->y_buffer_8bit;
+ assert(buf_8bit);
+ if (!frm->buf_8bit_valid) {
+ for (i = 0; i < frm->y_height; ++i) {
+ for (j = 0; j < frm->y_width; ++j) {
+ buf_8bit[i * frm->y_stride + j] =
+ orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
+ }
+ }
+ frm->buf_8bit_valid = 1;
+ }
+ return buf_8bit;
+}
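
The downconversion is a plain right shift of each luma sample, y_8(i, j) = y_hbd(i, j) >> (bit_depth - 8), so for 10-bit input a sample value of 512 becomes 128 and for 12-bit input it becomes 32; the 8-bit plane is cached in y_buffer_8bit and reused while buf_8bit_valid is set.
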
diff --git a/aom_dsp/flow_estimation/flow_estimation.h b/aom_dsp/flow_estimation/flow_estimation.h
new file mode 100644
index 000000000..ab9d328dc
--- /dev/null
+++ b/aom_dsp/flow_estimation/flow_estimation.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_H_
+
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_PARAMDIM 9
+#define MAX_CORNERS 4096
+#define MIN_INLIER_PROB 0.1
+
+/* clang-format off */
+enum {
+ IDENTITY = 0, // identity transformation, 0-parameter
+ TRANSLATION = 1, // translational motion 2-parameter
+ ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
+ AFFINE = 3, // affine, 6-parameter
+ TRANS_TYPES,
+} UENUM1BYTE(TransformationType);
+/* clang-format on */
+
+// Number of parameters used by each transformation in TransformationType
+static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+
+typedef enum {
+ GLOBAL_MOTION_FEATURE_BASED,
+ GLOBAL_MOTION_DISFLOW_BASED,
+} GlobalMotionEstimationType;
+
+typedef struct {
+ double params[MAX_PARAMDIM - 1];
+ int *inliers;
+ int num_inliers;
+} MotionModel;
+
+int aom_compute_global_motion(TransformationType type,
+ unsigned char *src_buffer, int src_width,
+ int src_height, int src_stride, int *src_corners,
+ int num_src_corners, YV12_BUFFER_CONFIG *ref,
+ int bit_depth,
+ GlobalMotionEstimationType gm_estimation_type,
+ int *num_inliers_by_motion,
+ MotionModel *params_by_motion, int num_motions);
+
+unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_H_
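
As a rough sketch of how a caller might drive this interface (buffer setup, the `ref` frame and error handling are assumed to exist elsewhere and are not part of this patch):

    // Hypothetical caller: estimate a ROTZOOM global motion model between
    // an 8-bit source plane and a reference frame.
    int corners[2 * MAX_CORNERS];
    int inliers[2 * MAX_CORNERS];
    MotionModel motion = { { 0 }, inliers, 0 };
    int num_inliers = 0;

    const int num_corners = av1_fast_corner_detect(
        src_buffer, src_width, src_height, src_stride, corners, MAX_CORNERS);

    const int found = aom_compute_global_motion(
        ROTZOOM, src_buffer, src_width, src_height, src_stride, corners,
        num_corners, ref /* YV12_BUFFER_CONFIG * */, 8 /* bit_depth */,
        GLOBAL_MOTION_FEATURE_BASED, &num_inliers, &motion,
        /*num_motions=*/1);
    // On success, motion.params holds the fitted model and motion.inliers
    // the inlier correspondences gathered by RANSAC.
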
diff --git a/av1/encoder/ransac.c b/aom_dsp/flow_estimation/ransac.c
index f878fce6a..8ffc30d7a 100644
--- a/av1/encoder/ransac.c
+++ b/aom_dsp/flow_estimation/ransac.c
@@ -8,6 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
#include <memory.h>
#include <math.h>
#include <time.h>
@@ -15,8 +16,10 @@
#include <stdlib.h>
#include <assert.h>
+#include "aom_dsp/flow_estimation/ransac.h"
#include "aom_dsp/mathutils.h"
-#include "av1/encoder/ransac.h"
+
+// TODO(rachelbarker): Remove dependence on code in av1/encoder/
#include "av1/encoder/random.h"
#define MAX_MINPTS 4
diff --git a/av1/encoder/ransac.h b/aom_dsp/flow_estimation/ransac.h
index 583d97152..aa3a2439a 100644
--- a/av1/encoder/ransac.h
+++ b/aom_dsp/flow_estimation/ransac.h
@@ -9,16 +9,19 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AV1_ENCODER_RANSAC_H_
-#define AOM_AV1_ENCODER_RANSAC_H_
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
-#include "av1/common/warped_motion.h"
-#include "av1/encoder/global_motion.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
typedef int (*RansacFunc)(int *matched_points, int npoints,
int *num_inliers_by_motion,
@@ -28,4 +31,9 @@ typedef int (*RansacFuncDouble)(double *matched_points, int npoints,
MotionModel *params_by_motion, int num_motions);
RansacFunc av1_get_ransac_type(TransformationType type);
RansacFuncDouble av1_get_ransac_double_prec_type(TransformationType type);
-#endif // AOM_AV1_ENCODER_RANSAC_H_
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
diff --git a/av1/encoder/x86/corner_match_avx2.c b/aom_dsp/flow_estimation/x86/corner_match_avx2.c
index 033ae3773..9830ad820 100644
--- a/av1/encoder/x86/corner_match_avx2.c
+++ b/aom_dsp/flow_estimation/x86/corner_match_avx2.c
@@ -12,10 +12,10 @@
#include <math.h>
#include <immintrin.h>
-#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
-#include "av1/encoder/corner_match.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
DECLARE_ALIGNED(16, static const uint8_t,
byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
diff --git a/av1/encoder/x86/corner_match_sse4.c b/aom_dsp/flow_estimation/x86/corner_match_sse4.c
index 1a879dad3..40eec6cc4 100644
--- a/av1/encoder/x86/corner_match_sse4.c
+++ b/aom_dsp/flow_estimation/x86/corner_match_sse4.c
@@ -16,10 +16,10 @@
#include <smmintrin.h>
-#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
-#include "av1/encoder/corner_match.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
DECLARE_ALIGNED(16, static const uint8_t,
byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index 00396c8e7..6ec091f5f 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -52,9 +52,9 @@ static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
const int p_top_left = abs_diff(base, top_left);
// Return nearest to base of left, top and top_left.
- return (p_left <= p_top && p_left <= p_top_left)
- ? left
- : (p_top <= p_top_left) ? top : top_left;
+ return (p_left <= p_top && p_left <= p_top_left) ? left
+ : (p_top <= p_top_left) ? top
+ : top_left;
}
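
For intuition: the Paeth predictor picks whichever of left, top and top_left is closest to the base value left + top - top_left. For example, with left = 10, top = 14 and top_left = 12 the base is 12; the distances are 2, 2 and 0, so top_left (12) is returned, with ties broken in the order left, then top, then top_left.
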
static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
diff --git a/aom_dsp/mathutils.h b/aom_dsp/mathutils.h
index 3ffca8a17..22b020257 100644
--- a/aom_dsp/mathutils.h
+++ b/aom_dsp/mathutils.h
@@ -108,255 +108,4 @@ static INLINE void multiply_mat(const double *m1, const double *m2, double *res,
}
}
-//
-// The functions below are needed only for homography computation
-// Remove if the homography models are not used.
-//
-///////////////////////////////////////////////////////////////////////////////
-// svdcmp
-// Adopted from Numerical Recipes in C
-
-static INLINE double apply_sign(double a, double b) {
- return ((b) >= 0 ? fabs(a) : -fabs(a));
-}
-
-static INLINE double pythag(double a, double b) {
- double ct;
- const double absa = fabs(a);
- const double absb = fabs(b);
-
- if (absa > absb) {
- ct = absb / absa;
- return absa * sqrt(1.0 + ct * ct);
- } else {
- ct = absa / absb;
- return (absb == 0) ? 0 : absb * sqrt(1.0 + ct * ct);
- }
-}
-
-static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
- const int max_its = 30;
- int flag, i, its, j, jj, k, l, nm;
- double anorm, c, f, g, h, s, scale, x, y, z;
- double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
- if (!rv1) return 0;
- g = scale = anorm = 0.0;
- for (i = 0; i < n; i++) {
- l = i + 1;
- rv1[i] = scale * g;
- g = s = scale = 0.0;
- if (i < m) {
- for (k = i; k < m; k++) scale += fabs(u[k][i]);
- if (scale != 0.) {
- for (k = i; k < m; k++) {
- u[k][i] /= scale;
- s += u[k][i] * u[k][i];
- }
- f = u[i][i];
- g = -apply_sign(sqrt(s), f);
- h = f * g - s;
- u[i][i] = f - g;
- for (j = l; j < n; j++) {
- for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
- f = s / h;
- for (k = i; k < m; k++) u[k][j] += f * u[k][i];
- }
- for (k = i; k < m; k++) u[k][i] *= scale;
- }
- }
- w[i] = scale * g;
- g = s = scale = 0.0;
- if (i < m && i != n - 1) {
- for (k = l; k < n; k++) scale += fabs(u[i][k]);
- if (scale != 0.) {
- for (k = l; k < n; k++) {
- u[i][k] /= scale;
- s += u[i][k] * u[i][k];
- }
- f = u[i][l];
- g = -apply_sign(sqrt(s), f);
- h = f * g - s;
- u[i][l] = f - g;
- for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
- for (j = l; j < m; j++) {
- for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
- for (k = l; k < n; k++) u[j][k] += s * rv1[k];
- }
- for (k = l; k < n; k++) u[i][k] *= scale;
- }
- }
- anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
- }
-
- for (i = n - 1; i >= 0; i--) {
- if (i < n - 1) {
- if (g != 0.) {
- for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
- for (j = l; j < n; j++) {
- for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
- for (k = l; k < n; k++) v[k][j] += s * v[k][i];
- }
- }
- for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
- }
- v[i][i] = 1.0;
- g = rv1[i];
- l = i;
- }
- for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
- l = i + 1;
- g = w[i];
- for (j = l; j < n; j++) u[i][j] = 0.0;
- if (g != 0.) {
- g = 1.0 / g;
- for (j = l; j < n; j++) {
- for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
- f = (s / u[i][i]) * g;
- for (k = i; k < m; k++) u[k][j] += f * u[k][i];
- }
- for (j = i; j < m; j++) u[j][i] *= g;
- } else {
- for (j = i; j < m; j++) u[j][i] = 0.0;
- }
- ++u[i][i];
- }
- for (k = n - 1; k >= 0; k--) {
- for (its = 0; its < max_its; its++) {
- flag = 1;
- for (l = k; l >= 0; l--) {
- nm = l - 1;
- if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) {
- flag = 0;
- break;
- }
- if ((double)(fabs(w[nm]) + anorm) == anorm) break;
- }
- if (flag) {
- c = 0.0;
- s = 1.0;
- for (i = l; i <= k; i++) {
- f = s * rv1[i];
- rv1[i] = c * rv1[i];
- if ((double)(fabs(f) + anorm) == anorm) break;
- g = w[i];
- h = pythag(f, g);
- w[i] = h;
- h = 1.0 / h;
- c = g * h;
- s = -f * h;
- for (j = 0; j < m; j++) {
- y = u[j][nm];
- z = u[j][i];
- u[j][nm] = y * c + z * s;
- u[j][i] = z * c - y * s;
- }
- }
- }
- z = w[k];
- if (l == k) {
- if (z < 0.0) {
- w[k] = -z;
- for (j = 0; j < n; j++) v[j][k] = -v[j][k];
- }
- break;
- }
- if (its == max_its - 1) {
- aom_free(rv1);
- return 1;
- }
- assert(k > 0);
- x = w[l];
- nm = k - 1;
- y = w[nm];
- g = rv1[nm];
- h = rv1[k];
- f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
- g = pythag(f, 1.0);
- f = ((x - z) * (x + z) + h * ((y / (f + apply_sign(g, f))) - h)) / x;
- c = s = 1.0;
- for (j = l; j <= nm; j++) {
- i = j + 1;
- g = rv1[i];
- y = w[i];
- h = s * g;
- g = c * g;
- z = pythag(f, h);
- rv1[j] = z;
- c = f / z;
- s = h / z;
- f = x * c + g * s;
- g = g * c - x * s;
- h = y * s;
- y *= c;
- for (jj = 0; jj < n; jj++) {
- x = v[jj][j];
- z = v[jj][i];
- v[jj][j] = x * c + z * s;
- v[jj][i] = z * c - x * s;
- }
- z = pythag(f, h);
- w[j] = z;
- if (z != 0.) {
- z = 1.0 / z;
- c = f * z;
- s = h * z;
- }
- f = c * g + s * y;
- x = c * y - s * g;
- for (jj = 0; jj < m; jj++) {
- y = u[jj][j];
- z = u[jj][i];
- u[jj][j] = y * c + z * s;
- u[jj][i] = z * c - y * s;
- }
- }
- rv1[l] = 0.0;
- rv1[k] = f;
- w[k] = x;
- }
- }
- aom_free(rv1);
- return 0;
-}
-
-static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
- int N) {
- // Assumes allocation for U is MxN
- double **nrU = (double **)aom_malloc((M) * sizeof(*nrU));
- double **nrV = (double **)aom_malloc((N) * sizeof(*nrV));
- int problem, i;
-
- problem = !(nrU && nrV);
- if (!problem) {
- for (i = 0; i < M; i++) {
- nrU[i] = &U[i * N];
- }
- for (i = 0; i < N; i++) {
- nrV[i] = &V[i * N];
- }
- } else {
- aom_free(nrU);
- aom_free(nrV);
- return 1;
- }
-
- /* copy from given matx into nrU */
- for (i = 0; i < M; i++) {
- memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx));
- }
-
- /* HERE IT IS: do SVD */
- if (svdcmp(nrU, M, N, W, nrV)) {
- aom_free(nrU);
- aom_free(nrV);
- return 1;
- }
-
- /* aom_free Numerical Recipes arrays */
- aom_free(nrU);
- aom_free(nrV);
-
- return 0;
-}
-
#endif // AOM_AOM_DSP_MATHUTILS_H_
diff --git a/aom_dsp/mips/aom_convolve8_horiz_msa.c b/aom_dsp/mips/aom_convolve8_horiz_msa.c
deleted file mode 100644
index c8ab61249..000000000
--- a/aom_dsp/mips/aom_convolve8_horiz_msa.c
+++ /dev/null
@@ -1,693 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 mask0, mask1, mask2, mask3, out;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v8i16 filt, out0, out1;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1);
- SRARI_H2_SH(out0, out1, FILTER_BITS);
- SAT_SH2_SH(out0, out1, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src0, src1, src2, src3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1, out2,
- out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- tmp0 = PCKEV_XORI128_UB(out0, out1);
- tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-}
-
-static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- tmp0 = PCKEV_XORI128_UB(out0, out1);
- tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_SB2(src, src_stride, src0, src2);
- LD_SB2(src + 8, src_stride, src1, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (2 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- dst += dst_stride;
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
-
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
- dst += dst_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- int32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
-
- src0 = LD_SB(src + 32);
- src2 = LD_SB(src + 48);
- src3 = LD_SB(src + 56);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst + 32);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 48);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, vec0, vec1, res0, res1;
- v8u16 vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
- SRARI_H2_UH(vec2, vec3, FILTER_BITS);
- PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 vec0, vec1, vec2, vec3, filt0;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16i8 res0, res1, res2, res3;
- v8u16 vec4, vec5, vec6, vec7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
- vec6, vec7);
- SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 filt0;
- v16i8 src0, src1, src2, src3, mask;
- v8u16 vec0, vec1, vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
- ST8x4_UB(src0, src1, dst, dst_stride);
-}
-
-static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- v16u8 filt0;
- v16i8 src0, src1, src2, src3, mask, out0, out1;
- v8u16 vec0, vec1, vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- if (16 == height) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
-
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
- }
-}
-
-static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- loop_cnt = (height >> 2) - 1;
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out2, out3, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out6, out7, dst);
- dst += dst_stride;
-
- for (; loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out2, out3, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out6, out7, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- for (loop_cnt = height >> 1; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- src4 = LD_SB(src);
- src6 = LD_SB(src + 16);
- src7 = LD_SB(src + 24);
- src5 = __msa_sldi_b(src6, src4, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- PCKEV_ST_SB(out2, out3, dst + 16);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- PCKEV_ST_SB(out6, out7, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src4 = LD_SB(src + 32);
- src6 = LD_SB(src + 48);
- src7 = LD_SB(src + 56);
- SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- PCKEV_ST_SB(out2, out3, dst + 16);
- PCKEV_ST_SB(out4, out5, dst + 32);
- PCKEV_ST_SB(out6, out7, dst + 48);
- dst += dst_stride;
- }
-}
-
-void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- int8_t cnt, filt_hor[8];
-
- assert(x_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0) {
- switch (w) {
- case 4:
- common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 8:
- common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 16:
- common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 32:
- common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 64:
- common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- default:
- aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 8:
- common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 16:
- common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 32:
- common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 64:
- common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- default:
- aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
diff --git a/aom_dsp/mips/aom_convolve8_vert_msa.c b/aom_dsp/mips/aom_convolve8_vert_msa.c
deleted file mode 100644
index 2c3bc084c..000000000
--- a/aom_dsp/mips/aom_convolve8_vert_msa.c
+++ /dev/null
@@ -1,699 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
- v16i8 src10998, filt0, filt1, filt2, filt3;
- v16u8 out;
- v8i16 filt, out10, out32;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
- src4332, src6554);
- XORI_B3_128_SB(src2110, src4332, src6554);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
- XORI_B2_128_SB(src8776, src10998);
- out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
- filt1, filt2, filt3);
- out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
- filt1, filt2, filt3);
- SRARI_H2_SH(out10, out32, FILTER_BITS);
- SAT_SH2_SH(out10, out32, 7);
- out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src2110 = src6554;
- src4332 = src8776;
- src6554 = src10998;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
- v16u8 tmp0, tmp1;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
- tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
- src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
- tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height,
- int32_t width) {
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- uint32_t loop_cnt, cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src_tmp += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
- src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src_tmp += (4 * src_stride);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
- out3_r, tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
- dst_tmp += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
- 32);
-}
-
-static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
- 64);
-}
-
-static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4;
- v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
- v16u8 filt0;
- v8i16 filt;
- v8u16 tmp0, tmp1;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 filt;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- src8 = LD_SB(src);
-
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
- src76_r, src87_r);
- ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
- src76_r, src2110, src4332, src6554, src8776);
- DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
-}
-
-static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
- v16i8 out0, out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
- ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v16i8 out0, out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
- src += (8 * src_stride);
-
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
- vec3);
- ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
- vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src8;
- }
-}
-
-static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- dst += dst_stride;
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst);
- dst += dst_stride;
-
- src0 = src4;
- }
-}
-
-static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src5 = LD_UB(src + 16);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
- LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
- src += (4 * src_stride);
-
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
-
- ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
- ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 16);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
- ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src4;
- src5 = src9;
- }
-}
-
-static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB4(src, 16, src0, src3, src6, src9);
- src += src_stride;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_UB2(src, src_stride, src1, src2);
- LD_UB2(src + 16, src_stride, src4, src5);
- LD_UB2(src + 32, src_stride, src7, src8);
- LD_UB2(src + 48, src_stride, src10, src11);
- src += (2 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
- ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
- ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- PCKEV_ST_SB(tmp4, tmp5, dst + 16);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
- ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 32);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
-
- ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
- ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- PCKEV_ST_SB(tmp4, tmp5, dst + 48);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
- dst += (2 * dst_stride);
-
- src0 = src2;
- src3 = src5;
- src6 = src8;
- src9 = src11;
- }
-}
-
-void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- int8_t cnt, filt_ver[8];
-
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- for (cnt = 8; cnt--;) {
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 8:
- common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 16:
- common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 32:
- common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 64:
- common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 8:
- common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 16:
- common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 32:
- common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 64:
- common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
diff --git a/aom_dsp/mips/aom_convolve_copy_dspr2.c b/aom_dsp/mips/aom_convolve_copy_dspr2.c
deleted file mode 100644
index 12a213eaa..000000000
--- a/aom_dsp/mips/aom_convolve_copy_dspr2.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
- int x, y;
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4: {
- uint32_t tp1;
-
- /* 1 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], (%[src]) \n\t"
- "sw %[tp1], (%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 8: {
- uint32_t tp1, tp2;
-
- /* 2 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 16: {
- uint32_t tp1, tp2, tp3, tp4;
-
- /* 4 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "ulw %[tp4], 12(%[src]) \n\t"
-
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 32: {
- uint32_t tp1, tp2, tp3, tp4;
- uint32_t tp5, tp6, tp7, tp8;
-
- /* 8 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "ulw %[tp4], 12(%[src]) \n\t"
- "ulw %[tp5], 16(%[src]) \n\t"
- "ulw %[tp6], 20(%[src]) \n\t"
- "ulw %[tp7], 24(%[src]) \n\t"
- "ulw %[tp8], 28(%[src]) \n\t"
-
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
- "sw %[tp5], 16(%[dst]) \n\t" /* store */
- "sw %[tp6], 20(%[dst]) \n\t" /* store */
- "sw %[tp7], 24(%[dst]) \n\t" /* store */
- "sw %[tp8], 28(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
- [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 64: {
- uint32_t tp1, tp2, tp3, tp4;
- uint32_t tp5, tp6, tp7, tp8;
-
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- /* 16 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_load(src + src_stride + 64);
- prefetch_store(dst + dst_stride);
- prefetch_store(dst + dst_stride + 32);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "ulw %[tp4], 12(%[src]) \n\t"
- "ulw %[tp5], 16(%[src]) \n\t"
- "ulw %[tp6], 20(%[src]) \n\t"
- "ulw %[tp7], 24(%[src]) \n\t"
- "ulw %[tp8], 28(%[src]) \n\t"
-
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
- "sw %[tp5], 16(%[dst]) \n\t" /* store */
- "sw %[tp6], 20(%[dst]) \n\t" /* store */
- "sw %[tp7], 24(%[dst]) \n\t" /* store */
- "sw %[tp8], 28(%[dst]) \n\t" /* store */
-
- "ulw %[tp1], 32(%[src]) \n\t"
- "ulw %[tp2], 36(%[src]) \n\t"
- "ulw %[tp3], 40(%[src]) \n\t"
- "ulw %[tp4], 44(%[src]) \n\t"
- "ulw %[tp5], 48(%[src]) \n\t"
- "ulw %[tp6], 52(%[src]) \n\t"
- "ulw %[tp7], 56(%[src]) \n\t"
- "ulw %[tp8], 60(%[src]) \n\t"
-
- "sw %[tp1], 32(%[dst]) \n\t" /* store */
- "sw %[tp2], 36(%[dst]) \n\t" /* store */
- "sw %[tp3], 40(%[dst]) \n\t" /* store */
- "sw %[tp4], 44(%[dst]) \n\t" /* store */
- "sw %[tp5], 48(%[dst]) \n\t" /* store */
- "sw %[tp6], 52(%[dst]) \n\t" /* store */
- "sw %[tp7], 56(%[dst]) \n\t" /* store */
- "sw %[tp8], 60(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
- [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- default:
- for (y = h; y--;) {
- for (x = 0; x < w; ++x) {
- dst[x] = src[x];
- }
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
-}
-#endif
diff --git a/aom_dsp/mips/aom_convolve_copy_msa.c b/aom_dsp/mips/aom_convolve_copy_msa.c
deleted file mode 100644
index 12e7d9539..000000000
--- a/aom_dsp/mips/aom_convolve_copy_msa.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-#include "aom_dsp/mips/macros_msa.h"
-
-static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- out4 = __msa_copy_u_d((v2i64)src4, 0);
- out5 = __msa_copy_u_d((v2i64)src5, 0);
- out6 = __msa_copy_u_d((v2i64)src6, 0);
- out7 = __msa_copy_u_d((v2i64)src7, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- for (cnt = height >> 3; cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- out4 = __msa_copy_u_d((v2i64)src4, 0);
- out5 = __msa_copy_u_d((v2i64)src5, 0);
- out6 = __msa_copy_u_d((v2i64)src6, 0);
- out7 = __msa_copy_u_d((v2i64)src7, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 4) {
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 2) {
- for (cnt = (height / 2); cnt--;) {
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
-
- SD(out0, dst);
- dst += dst_stride;
- SD(out1, dst);
- dst += dst_stride;
- }
- }
-}
-
-static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height, int32_t width) {
- int32_t cnt, loop_cnt;
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
- src7);
- src_tmp += (8 * src_stride);
-
- ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
- dst_stride);
- dst_tmp += (8 * dst_stride);
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
- dst += (8 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
- } else if (0 == height % 4) {
- for (cnt = (height >> 2); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- }
-}
-
-static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
- } else if (0 == height % 4) {
- for (cnt = (height >> 2); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
- }
-}
-
-static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
-}
-
-void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride, int32_t w,
- int32_t h) {
- switch (w) {
- case 4: {
- uint32_t cnt, tmp;
- /* 1 word storage */
- for (cnt = h; cnt--;) {
- tmp = LW(src);
- SW(tmp, dst);
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- case 8: {
- copy_width8_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 16: {
- copy_width16_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 32: {
- copy_width32_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 64: {
- copy_width64_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- default: {
- uint32_t cnt;
- for (cnt = h; cnt--;) {
- memmove(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- }
-}
diff --git a/aom_dsp/mips/aom_convolve_msa.h b/aom_dsp/mips/aom_convolve_msa.h
deleted file mode 100644
index 852415c20..000000000
--- a/aom_dsp/mips/aom_convolve_msa.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
-#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/aom_filter.h"
-
-extern const uint8_t mc_filt_mask_arr[16 * 3];
-
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
- filt3) \
- ({ \
- v8i16 tmp_dpadd_0, tmp_dpadd_1; \
- \
- tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
- tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
- tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
- tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
- tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \
- \
- tmp_dpadd_0; \
- })
-
-#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
- mask2, mask3, filt0, filt1, filt2, filt3, \
- out0, out1) \
- { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- \
- VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
- DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
- DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
- DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
- ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
- }
-
-#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
- mask2, mask3, filt0, filt1, filt2, filt3, \
- out0, out1, out2, out3) \
- { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
- \
- VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
- res4_m, res5_m, res6_m, res7_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
- res4_m, res5_m, res6_m, res7_m); \
- ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
- res7_m, out0, out1, out2, out3); \
- }
-
-#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
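
For reference, the per-lane arithmetic behind FILT_8TAP_DPADD_S_H above can be sketched in scalar C: each vecN/filtN pair carries the source samples and coefficients for two adjacent taps of the 8-tap filter, the two dot-product accumulators cover taps 0-3 and 4-7, and the final add saturates to int16. This is a minimal sketch, not code from the tree, and it assumes the intermediate sums stay within 16-bit range as they do for real filter coefficients.

    #include <stdint.h>

    static int16_t sat_add_s16(int32_t a, int32_t b) {
      int32_t sum = a + b;                    /* models __msa_adds_s_h */
      if (sum > INT16_MAX) sum = INT16_MAX;
      if (sum < INT16_MIN) sum = INT16_MIN;
      return (int16_t)sum;
    }

    /* One output lane of FILT_8TAP_DPADD_S_H: vec[] holds 8 shuffled source
       samples, filt[] the 8 filter taps. */
    static int16_t filt_8tap_dpadd_lane(const int8_t vec[8], const int8_t filt[8]) {
      int32_t lo = vec[0] * filt[0] + vec[1] * filt[1] +  /* dotp/dpadd, taps 0..3 */
                   vec[2] * filt[2] + vec[3] * filt[3];
      int32_t hi = vec[4] * filt[4] + vec[5] * filt[5] +  /* dotp/dpadd, taps 4..7 */
                   vec[6] * filt[6] + vec[7] * filt[7];
      return sat_add_s16(lo, hi);
    }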
diff --git a/aom_dsp/mips/common_dspr2.c b/aom_dsp/mips/common_dspr2.c
deleted file mode 100644
index 00ab75dc3..000000000
--- a/aom_dsp/mips/common_dspr2.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
-uint8_t *aom_ff_cropTbl;
-
-void aom_dsputil_static_init(void) {
- int i;
-
- for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
-
- for (i = 0; i < CROP_WIDTH; i++) {
- aom_ff_cropTbl_a[i] = 0;
- aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
- }
-
- aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
-}
-
-#endif
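
The table built by aom_dsputil_static_init() above is a branch-free clamp: indexing aom_ff_cropTbl with any value in [-CROP_WIDTH, 255 + CROP_WIDTH) yields that value clipped to [0, 255], which is why the dspr2 kernels below can clamp a filtered sample with a single lbux table lookup. A scalar sketch of the equivalent operation (illustrative only, not code from the tree):

    static unsigned char clamp_like_crop_table(int v) {
      /* same result as aom_ff_cropTbl[v] for v in [-CROP_WIDTH, 255 + CROP_WIDTH) */
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }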
diff --git a/aom_dsp/mips/common_dspr2.h b/aom_dsp/mips/common_dspr2.h
deleted file mode 100644
index c42188d62..000000000
--- a/aom_dsp/mips/common_dspr2.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#if HAVE_DSPR2
-#define CROP_WIDTH 512
-
-extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c"
-
-static INLINE void prefetch_load(const unsigned char *src) {
- __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
-}
-
-/* prefetch data for store */
-static INLINE void prefetch_store(unsigned char *dst) {
- __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
-}
-
-static INLINE void prefetch_load_streamed(const unsigned char *src) {
- __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
-}
-
-/* prefetch data for store */
-static INLINE void prefetch_store_streamed(unsigned char *dst) {
- __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
-}
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
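
The prefetch helpers above wrap the MIPS `pref` instruction with hints 0/1 (temporal load/store) and 4/5 (streamed). On toolchains without the dspr2 inline assembly, roughly the same intent can be expressed with the GCC/Clang builtin; the hint-to-locality mapping below is an approximation chosen for illustration, not something taken from this tree.

    static inline void prefetch_load_generic(const unsigned char *src) {
      __builtin_prefetch(src, 0, 3);   /* read, keep in cache: ~ "pref 0" */
    }

    static inline void prefetch_store_generic(unsigned char *dst) {
      __builtin_prefetch(dst, 1, 3);   /* write, keep in cache: ~ "pref 1" */
    }

    static inline void prefetch_load_streamed_generic(const unsigned char *src) {
      __builtin_prefetch(src, 0, 0);   /* read, non-temporal: ~ "pref 4" */
    }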
diff --git a/aom_dsp/mips/convolve2_dspr2.c b/aom_dsp/mips/convolve2_dspr2.c
deleted file mode 100644
index 08bf1ab30..000000000
--- a/aom_dsp/mips/convolve2_dspr2.c
+++ /dev/null
@@ -1,1031 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_horiz_4_transposed_dspr2(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint8_t *dst_ptr;
- int32_t Temp1, Temp2;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- dst_ptr = dst;
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp2](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[p1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[tp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[p2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [src] "r"(src), [dst_stride] "r"(dst_stride));
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
-}
-
-static void convolve_bi_horiz_8_transposed_dspr2(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint8_t *dst_ptr;
- uint32_t vector4a = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2, tp3;
- uint32_t p1, p2, p3, p4;
- uint8_t *odd_dst;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
-
- dst_ptr = dst;
- odd_dst = (dst_ptr + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "balign %[tp3], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
- "lbux %[tp1], %[Temp3](%[cm]) \n\t"
- "extp %[p3], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[Temp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[Temp1], %[p3](%[cm]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tp3] \n\t"
- "preceu.ph.qbl %[p4], %[tp3] \n\t"
- "sb %[Temp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
-
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp1], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[tp3], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[tp3], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[p2], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[p1], 0(%[odd_dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
- [odd_dst] "+r"(odd_dst)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
-}
-
-static void convolve_bi_horiz_16_transposed_dspr2(
- const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
-
- src = src_ptr;
- dst = dst_ptr;
-
- odd_dst = (dst + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) "
- "\n\t"
- "ulw %[qload2], 4(%[src]) "
- "\n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 1 */
- "mthi $zero, $ac1 "
- "\n\t"
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 2 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "ulw %[qload1], 8(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 3 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload1] "
- "\n\t"
- "ulw %[qload2], 12(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] "
- "\n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 4 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- " \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] "
- "\n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 5 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] "
- "\n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 6 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] "
- "\n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 7 */
- "mthi $zero, $ac1 "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 20(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] "
- "\n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 8 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 1 */
- "mthi $zero, $ac3 "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) "
- "\n\t"
- "ulw %[qload2], 5(%[src]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 2 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 9(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] "
- "\n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 3 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 4 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 5 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] "
- "\n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 6 */
- "mthi $zero, $ac2 "
- "\n\t"
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 7 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 21(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] "
- "\n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 8 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] "
- "\n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] "
- "\n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
- [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
- [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
-}
-
-static void convolve_bi_horiz_64_transposed_dspr2(
- const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
-
- src = src_ptr;
- dst = dst_ptr;
-
- odd_dst = (dst + dst_stride);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) "
- "\n\t"
- "ulw %[qload2], 4(%[src]) "
- "\n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 1 */
- "mthi $zero, $ac1 "
- "\n\t"
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 2 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "ulw %[qload1], 8(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 3 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload1] "
- "\n\t"
- "ulw %[qload2], 12(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] "
- "\n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 4 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- " \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] "
- "\n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 5 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] "
- "\n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 6 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] "
- "\n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 7 */
- "mthi $zero, $ac1 "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 20(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] "
- "\n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 8 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 1 */
- "mthi $zero, $ac3 "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) "
- "\n\t"
- "ulw %[qload2], 5(%[src]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 2 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 9(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] "
- "\n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 3 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 4 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 5 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] "
- "\n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 6 */
- "mthi $zero, $ac2 "
- "\n\t"
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 7 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 21(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] "
- "\n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 8 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] "
- "\n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] "
- "\n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
- [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
- [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
-}
-
-void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter, int w, int h) {
- int x, y;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- int sum = 0;
-
- sum += src[x] * filter[3];
- sum += src[x + 1] * filter[4];
-
- dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- }
-
- src += src_stride;
- dst += 1;
- }
-}
-
-void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter, int w,
- int h) {
- uint32_t pos = 38;
-
-  /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
-
- switch (w) {
- case 4:
- convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h);
- break;
- case 8:
- convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h);
- break;
- case 16:
- case 32:
- convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h, (w / 16));
- break;
- case 64:
- prefetch_load(src + 32);
- convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h);
- break;
- default:
- convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
- h);
- break;
- }
-}
-#endif
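
Every path in the file above computes the same per-pixel result; the assembly only batches it and writes the output transposed (dst advances by one column per input row). The C fallback convolve_bi_horiz_transposed makes the arithmetic explicit: a 2-tap bilinear filter using taps 3 and 4 of the 8-tap array, rounded and clipped. With FILTER_BITS equal to 7 (aom_dsp/aom_filter.h), ROUND_POWER_OF_TWO adds 64, the same constant the assembly preloads into its accumulators, so a single output pixel reduces to the following sketch (illustrative, not code from the tree):

    #include <stdint.h>

    static uint8_t bilinear_pixel(const uint8_t *src, const int16_t *filter) {
      int sum = src[0] * filter[3] + src[1] * filter[4];  /* the 2 non-zero taps */
      sum = (sum + 64) >> 7;           /* ROUND_POWER_OF_TWO(sum, FILTER_BITS) */
      return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));  /* clip_pixel() */
    }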
diff --git a/aom_dsp/mips/convolve2_horiz_dspr2.c b/aom_dsp/mips/convolve2_horiz_dspr2.c
deleted file mode 100644
index 097da73ca..000000000
--- a/aom_dsp/mips/convolve2_horiz_dspr2.c
+++ /dev/null
@@ -1,681 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- int32_t Temp1, Temp2, Temp3, Temp4;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp4], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p1], %[Temp2](%[cm]) \n\t"
- "lbux %[p2], %[Temp4](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst]) \n\t"
- "sb %[p1], 1(%[dst]) \n\t"
- "sb %[tp2], 2(%[dst]) \n\t"
- "sb %[p2], 3(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [Temp4] "=&r"(Temp4)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2, tp3;
- uint32_t p1, p2, p3, p4;
- uint32_t st0, st1;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[st0], 0(%[dst]) \n\t"
- "lbux %[st1], %[Temp3](%[cm]) \n\t"
-
- "balign %[tp3], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 2(%[dst]) \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tp3] \n\t"
- "preceu.ph.qbl %[p4], %[tp3] \n\t"
- "sb %[st0], 4(%[dst]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "lbux %[st0], %[Temp3](%[cm]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[st1], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[st1], 1(%[dst]) \n\t"
- "sb %[st0], 6(%[dst]) \n\t"
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 3(%[dst]) \n\t"
- "sb %[p2], 5(%[dst]) \n\t"
- "sb %[p1], 7(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h,
- int32_t count) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_store(dst_ptr + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
- prefetch_store(dst_ptr + dst_stride);
- prefetch_store(dst_ptr + dst_stride + 32);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- uint32_t pos = 38;
-
- assert(x_step_q4 == 16);
-
- prefetch_load((const uint8_t *)filter_x);
-
-  /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 8:
- convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 16:
- convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 1);
- break;
- case 32:
- convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 2);
- break;
- case 64:
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- default:
- aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
-}
-#endif
diff --git a/aom_dsp/mips/convolve2_vert_dspr2.c b/aom_dsp/mips/convolve2_vert_dspr2.c
deleted file mode 100644
index 40abfd89e..000000000
--- a/aom_dsp/mips/convolve2_vert_dspr2.c
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t w,
- int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < w; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
- [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < 64; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
- [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- uint32_t pos = 38;
-
- assert(y_step_q4 == 16);
-
-  /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- case 8:
- case 16:
- case 32:
- convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
- h);
- break;
- case 64:
- prefetch_store(dst + 32);
- convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
-}
-#endif
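
For reference (an illustrative sketch, not part of this change): the 2-tap vertical path removed above computes, per output pixel, the rounded and clamped dot product of two vertically adjacent source pixels with the two non-zero bilinear taps filter_y[3] and filter_y[4]. The rounding constant 64 loaded into the accumulators corresponds to 1 << (FILTER_BITS - 1) assuming FILTER_BITS == 7, and the aom_ff_cropTbl lookup performs the final clamp to [0, 255]. A plain-C equivalent might look like the following (function name hypothetical):

#include <stdint.h>

/* Scalar sketch of the deleted convolve_bi_vert_{4,64}_dspr2 kernels. */
static void convolve_bi_vert_c_sketch(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride,
                                      const int16_t *filter_y, int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      /* Two-tap vertical filter with rounding, shifted by FILTER_BITS (7). */
      int sum =
          (src[x] * filter_y[3] + src[x + src_stride] * filter_y[4] + 64) >> 7;
      /* Clamp to the 8-bit pixel range, as the crop table does above. */
      dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += src_stride;
    dst += dst_stride;
  }
}
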
diff --git a/aom_dsp/mips/convolve8_horiz_dspr2.c b/aom_dsp/mips/convolve8_horiz_dspr2.c
deleted file mode 100644
index f9c6879ab..000000000
--- a/aom_dsp/mips/convolve8_horiz_dspr2.c
+++ /dev/null
@@ -1,879 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3, Temp4;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3, p4;
- uint32_t n1, n2, n3, n4;
- uint32_t tn1, tn2;
-
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "ulw %[tn2], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn2] \n\t"
- "balign %[tn1], %[tn2], 3 \n\t"
- "balign %[tn2], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[n1], %[tp2] \n\t"
- "preceu.ph.qbl %[n2], %[tp2] \n\t"
- "preceu.ph.qbr %[n3], %[tn2] \n\t"
- "preceu.ph.qbl %[n4], %[tn2] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[n1], %[tn1] \n\t"
- "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
- "extp %[Temp4], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[tn1], %[Temp2](%[cm]) \n\t"
- "lbux %[n2], %[Temp4](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst]) \n\t"
- "sb %[tn1], 1(%[dst]) \n\t"
- "sb %[tp2], 2(%[dst]) \n\t"
- "sb %[n2], 3(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
- [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3, p4, n1;
- uint32_t tn1, tn2, tn3;
- uint32_t st0, st1;
-
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tn2], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "preceu.ph.qbr %[p1], %[tn2] \n\t"
- "preceu.ph.qbl %[n1], %[tn2] \n\t"
- "ulw %[tn1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[tn1] \n\t"
- "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[st0], 0(%[dst]) \n\t"
- "lbux %[st1], %[Temp3](%[cm]) \n\t"
-
- "balign %[tn3], %[tn1], 3 \n\t"
- "balign %[tn1], %[tn2], 3 \n\t"
- "balign %[tn2], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 2(%[dst]) \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tn2] \n\t"
- "preceu.ph.qbl %[p4], %[tn2] \n\t"
- "sb %[st0], 4(%[dst]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn1] \n\t"
- "preceu.ph.qbl %[n1], %[tn1] \n\t"
- "lbux %[st0], %[Temp3](%[cm]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[st1], %[Temp2](%[cm]) \n\t"
- "preceu.ph.qbr %[p2], %[tn3] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[st1], 1(%[dst]) \n\t"
- "sb %[st0], 6(%[dst]) \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[n1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 3(%[dst]) \n\t"
- "sb %[p2], 5(%[dst]) \n\t"
- "sb %[n1], 7(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
- [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
- uint8_t *dst_ptr, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h,
- int32_t count) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
-
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_store(dst_ptr + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "ulw %[qload2], 16(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "ulw %[qload3], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "ulw %[qload3], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
- uint8_t *dst_ptr, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
-
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
- prefetch_store(dst_ptr + dst_stride);
- prefetch_store(dst_ptr + dst_stride + 32);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "ulw %[qload2], 16(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "ulw %[qload3], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "ulw %[qload3], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- assert(x_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
-
- if (((const int32_t *)filter_x)[0] == 0) {
- aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- uint32_t pos = 38;
-
- prefetch_load((const uint8_t *)filter_x);
- src -= 3;
-
-    /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 8:
- convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 16:
- convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 1);
- break;
- case 32:
- convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 2);
- break;
- case 64:
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- default:
- aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
-#endif
diff --git a/aom_dsp/mips/convolve8_vert_dspr2.c b/aom_dsp/mips/convolve8_vert_dspr2.c
deleted file mode 100644
index 201e66427..000000000
--- a/aom_dsp/mips/convolve8_vert_dspr2.c
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t w,
- int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
-
- vector1b = ((const int32_t *)filter_y)[0];
- vector2b = ((const int32_t *)filter_y)[1];
- vector3b = ((const int32_t *)filter_y)[2];
- vector4b = ((const int32_t *)filter_y)[3];
-
- src -= 3 * src_stride;
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < w; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
-
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac0, 31 \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
- [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
-
- vector1b = ((const int32_t *)filter_y)[0];
- vector2b = ((const int32_t *)filter_y)[1];
- vector3b = ((const int32_t *)filter_y)[2];
- vector4b = ((const int32_t *)filter_y)[3];
-
- src -= 3 * src_stride;
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
- prefetch_store(dst + dst_stride + 32);
-
- for (x = 0; x < 64; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
-
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac0, 31 \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
- [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- if (((const int32_t *)filter_y)[0] == 0) {
- aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- uint32_t pos = 38;
-
-    /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- case 8:
- case 16:
- case 32:
- convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
- break;
- case 64:
- prefetch_store(dst + 32);
- convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
-
-#endif
diff --git a/aom_dsp/mips/convolve_common_dspr2.h b/aom_dsp/mips/convolve_common_dspr2.h
deleted file mode 100644
index e5d48a884..000000000
--- a/aom_dsp/mips/convolve_common_dspr2.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
-
-void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter, int w,
- int h);
-
-void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
-
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
diff --git a/aom_dsp/mips/intrapred16_dspr2.c b/aom_dsp/mips/intrapred16_dspr2.c
deleted file mode 100644
index 7c221ae89..000000000
--- a/aom_dsp/mips/intrapred16_dspr2.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
-
- (void)above;
-
- __asm__ __volatile__(
- "lb %[tmp1], (%[left]) \n\t"
- "lb %[tmp2], 1(%[left]) \n\t"
- "lb %[tmp3], 2(%[left]) \n\t"
- "lb %[tmp4], 3(%[left]) \n\t"
- "lb %[tmp5], 4(%[left]) \n\t"
- "lb %[tmp6], 5(%[left]) \n\t"
- "lb %[tmp7], 6(%[left]) \n\t"
- "lb %[tmp8], 7(%[left]) \n\t"
- "lb %[tmp9], 8(%[left]) \n\t"
- "lb %[tmp10], 9(%[left]) \n\t"
- "lb %[tmp11], 10(%[left]) \n\t"
- "lb %[tmp12], 11(%[left]) \n\t"
- "lb %[tmp13], 12(%[left]) \n\t"
- "lb %[tmp14], 13(%[left]) \n\t"
- "lb %[tmp15], 14(%[left]) \n\t"
- "lb %[tmp16], 15(%[left]) \n\t"
-
- "replv.qb %[tmp1], %[tmp1] \n\t"
- "replv.qb %[tmp2], %[tmp2] \n\t"
- "replv.qb %[tmp3], %[tmp3] \n\t"
- "replv.qb %[tmp4], %[tmp4] \n\t"
- "replv.qb %[tmp5], %[tmp5] \n\t"
- "replv.qb %[tmp6], %[tmp6] \n\t"
- "replv.qb %[tmp7], %[tmp7] \n\t"
- "replv.qb %[tmp8], %[tmp8] \n\t"
- "replv.qb %[tmp9], %[tmp9] \n\t"
- "replv.qb %[tmp10], %[tmp10] \n\t"
- "replv.qb %[tmp11], %[tmp11] \n\t"
- "replv.qb %[tmp12], %[tmp12] \n\t"
- "replv.qb %[tmp13], %[tmp13] \n\t"
- "replv.qb %[tmp14], %[tmp14] \n\t"
- "replv.qb %[tmp15], %[tmp15] \n\t"
- "replv.qb %[tmp16], %[tmp16] \n\t"
-
- "sw %[tmp1], (%[dst]) \n\t"
- "sw %[tmp1], 4(%[dst]) \n\t"
- "sw %[tmp1], 8(%[dst]) \n\t"
- "sw %[tmp1], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp2], (%[dst]) \n\t"
- "sw %[tmp2], 4(%[dst]) \n\t"
- "sw %[tmp2], 8(%[dst]) \n\t"
- "sw %[tmp2], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp3], (%[dst]) \n\t"
- "sw %[tmp3], 4(%[dst]) \n\t"
- "sw %[tmp3], 8(%[dst]) \n\t"
- "sw %[tmp3], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp4], (%[dst]) \n\t"
- "sw %[tmp4], 4(%[dst]) \n\t"
- "sw %[tmp4], 8(%[dst]) \n\t"
- "sw %[tmp4], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp5], (%[dst]) \n\t"
- "sw %[tmp5], 4(%[dst]) \n\t"
- "sw %[tmp5], 8(%[dst]) \n\t"
- "sw %[tmp5], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp6], (%[dst]) \n\t"
- "sw %[tmp6], 4(%[dst]) \n\t"
- "sw %[tmp6], 8(%[dst]) \n\t"
- "sw %[tmp6], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp7], (%[dst]) \n\t"
- "sw %[tmp7], 4(%[dst]) \n\t"
- "sw %[tmp7], 8(%[dst]) \n\t"
- "sw %[tmp7], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp8], (%[dst]) \n\t"
- "sw %[tmp8], 4(%[dst]) \n\t"
- "sw %[tmp8], 8(%[dst]) \n\t"
- "sw %[tmp8], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp9], (%[dst]) \n\t"
- "sw %[tmp9], 4(%[dst]) \n\t"
- "sw %[tmp9], 8(%[dst]) \n\t"
- "sw %[tmp9], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp10], (%[dst]) \n\t"
- "sw %[tmp10], 4(%[dst]) \n\t"
- "sw %[tmp10], 8(%[dst]) \n\t"
- "sw %[tmp10], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp11], (%[dst]) \n\t"
- "sw %[tmp11], 4(%[dst]) \n\t"
- "sw %[tmp11], 8(%[dst]) \n\t"
- "sw %[tmp11], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp12], (%[dst]) \n\t"
- "sw %[tmp12], 4(%[dst]) \n\t"
- "sw %[tmp12], 8(%[dst]) \n\t"
- "sw %[tmp12], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp13], (%[dst]) \n\t"
- "sw %[tmp13], 4(%[dst]) \n\t"
- "sw %[tmp13], 8(%[dst]) \n\t"
- "sw %[tmp13], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp14], (%[dst]) \n\t"
- "sw %[tmp14], 4(%[dst]) \n\t"
- "sw %[tmp14], 8(%[dst]) \n\t"
- "sw %[tmp14], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp15], (%[dst]) \n\t"
- "sw %[tmp15], 4(%[dst]) \n\t"
- "sw %[tmp15], 8(%[dst]) \n\t"
- "sw %[tmp15], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp16], (%[dst]) \n\t"
- "sw %[tmp16], 4(%[dst]) \n\t"
- "sw %[tmp16], 8(%[dst]) \n\t"
- "sw %[tmp16], 12(%[dst]) \n\t"
-
- : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
- [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
- [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
- [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
- [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
- [tmp16] "=&r"(tmp16)
- : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
- int32_t above2, left2;
-
- __asm__ __volatile__(
- "lw %[above1], (%[above]) \n\t"
- "lw %[above2], 4(%[above]) \n\t"
- "lw %[left1], (%[left]) \n\t"
- "lw %[left2], 4(%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above1] \n\t"
- "preceu.ph.qbr %[above_r1], %[above1] \n\t"
- "preceu.ph.qbl %[left_l1], %[left1] \n\t"
- "preceu.ph.qbr %[left_r1], %[left1] \n\t"
-
- "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above2] \n\t"
- "preceu.ph.qbr %[above_r1], %[above2] \n\t"
- "preceu.ph.qbl %[left_l1], %[left2] \n\t"
- "preceu.ph.qbr %[left_r1], %[left2] \n\t"
-
- "addu.ph %[average], %[average], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[above_r1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "lw %[above1], 8(%[above]) \n\t"
- "lw %[above2], 12(%[above]) \n\t"
- "lw %[left1], 8(%[left]) \n\t"
- "lw %[left2], 12(%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above1] \n\t"
- "preceu.ph.qbr %[above_r1], %[above1] \n\t"
- "preceu.ph.qbl %[left_l1], %[left1] \n\t"
- "preceu.ph.qbr %[left_r1], %[left1] \n\t"
-
- "addu.ph %[average], %[average], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[above_r1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above2] \n\t"
- "preceu.ph.qbr %[above_r1], %[above2] \n\t"
- "preceu.ph.qbl %[left_l1], %[left2] \n\t"
- "preceu.ph.qbr %[left_r1], %[left2] \n\t"
-
- "addu.ph %[average], %[average], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[above_r1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "addiu %[average], %[average], 16 \n\t"
- "srl %[tmp], %[average], 16 \n\t"
- "addu.ph %[average], %[tmp], %[average] \n\t"
- "srl %[expected_dc], %[average], 5 \n\t"
- "replv.qb %[expected_dc], %[expected_dc] \n\t"
-
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
- [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
- [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
- [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
- [expected_dc] "=&r"(expected_dc)
- : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
- [stride] "r"(stride));
-}
-#endif // #if HAVE_DSPR2
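
For reference (not part of this diff): the 16x16 DC predictor whose deletion ends above sums the 16 above and 16 left neighbours, adds a rounding term of 16, divides by 32, and replicates the result across the block. A minimal scalar sketch of that behaviour follows; the function name is illustrative only.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative scalar equivalent of the removed DSPR2 16x16 DC predictor. */
static void dc_predictor_16x16_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left) {
  int sum = 16; /* rounding term: half of the 32 neighbour samples */
  for (int i = 0; i < 16; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)(sum >> 5); /* divide by 32 */
  for (int r = 0; r < 16; ++r) {
    memset(dst, dc, 16); /* every row of the 16x16 block holds the same value */
    dst += stride;
  }
}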
diff --git a/aom_dsp/mips/intrapred4_dspr2.c b/aom_dsp/mips/intrapred4_dspr2.c
deleted file mode 100644
index 0a21979c7..000000000
--- a/aom_dsp/mips/intrapred4_dspr2.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4;
- (void)above;
-
- __asm__ __volatile__(
- "lb %[tmp1], (%[left]) \n\t"
- "lb %[tmp2], 1(%[left]) \n\t"
- "lb %[tmp3], 2(%[left]) \n\t"
- "lb %[tmp4], 3(%[left]) \n\t"
- "replv.qb %[tmp1], %[tmp1] \n\t"
- "replv.qb %[tmp2], %[tmp2] \n\t"
- "replv.qb %[tmp3], %[tmp3] \n\t"
- "replv.qb %[tmp4], %[tmp4] \n\t"
- "sw %[tmp1], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp2], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp3], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp4], (%[dst]) \n\t"
-
- : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
- [tmp4] "=&r"(tmp4)
- : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
-
- __asm__ __volatile__(
- "lw %[above_c], (%[above]) \n\t"
- "lw %[left_c], (%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l], %[above_c] \n\t"
- "preceu.ph.qbr %[above_r], %[above_c] \n\t"
- "preceu.ph.qbl %[left_l], %[left_c] \n\t"
- "preceu.ph.qbr %[left_r], %[left_c] \n\t"
-
- "addu.ph %[average], %[above_r], %[above_l] \n\t"
- "addu.ph %[average], %[average], %[left_l] \n\t"
- "addu.ph %[average], %[average], %[left_r] \n\t"
- "addiu %[average], %[average], 4 \n\t"
- "srl %[tmp], %[average], 16 \n\t"
- "addu.ph %[average], %[tmp], %[average] \n\t"
- "srl %[expected_dc], %[average], 3 \n\t"
- "replv.qb %[expected_dc], %[expected_dc] \n\t"
-
- "sw %[expected_dc], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
-
- : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
- [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
- [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
- [average] "=&r"(average), [tmp] "=&r"(tmp),
- [expected_dc] "=&r"(expected_dc)
- : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
- [stride] "r"(stride));
-}
-#endif // #if HAVE_DSPR2
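
For reference (not part of this diff): the removed aom_h_predictor_4x4_dspr2 above fills each output row with the corresponding left-column sample and never reads the above row. A minimal scalar sketch, with an illustrative name:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative scalar equivalent of the removed 4x4 horizontal predictor. */
static void h_predictor_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above; /* the H predictor only reads the left column */
  for (int r = 0; r < 4; ++r) {
    memset(dst, left[r], 4); /* replicate left[r] across the row, as replv.qb does */
    dst += stride;
  }
}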
diff --git a/aom_dsp/mips/intrapred8_dspr2.c b/aom_dsp/mips/intrapred8_dspr2.c
deleted file mode 100644
index d42a77c80..000000000
--- a/aom_dsp/mips/intrapred8_dspr2.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- (void)above;
-
- __asm__ __volatile__(
- "lb %[tmp1], (%[left]) \n\t"
- "lb %[tmp2], 1(%[left]) \n\t"
- "lb %[tmp3], 2(%[left]) \n\t"
- "lb %[tmp4], 3(%[left]) \n\t"
- "lb %[tmp5], 4(%[left]) \n\t"
- "lb %[tmp6], 5(%[left]) \n\t"
- "lb %[tmp7], 6(%[left]) \n\t"
- "lb %[tmp8], 7(%[left]) \n\t"
-
- "replv.qb %[tmp1], %[tmp1] \n\t"
- "replv.qb %[tmp2], %[tmp2] \n\t"
- "replv.qb %[tmp3], %[tmp3] \n\t"
- "replv.qb %[tmp4], %[tmp4] \n\t"
- "replv.qb %[tmp5], %[tmp5] \n\t"
- "replv.qb %[tmp6], %[tmp6] \n\t"
- "replv.qb %[tmp7], %[tmp7] \n\t"
- "replv.qb %[tmp8], %[tmp8] \n\t"
-
- "sw %[tmp1], (%[dst]) \n\t"
- "sw %[tmp1], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp2], (%[dst]) \n\t"
- "sw %[tmp2], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp3], (%[dst]) \n\t"
- "sw %[tmp3], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp4], (%[dst]) \n\t"
- "sw %[tmp4], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp5], (%[dst]) \n\t"
- "sw %[tmp5], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp6], (%[dst]) \n\t"
- "sw %[tmp6], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp7], (%[dst]) \n\t"
- "sw %[tmp7], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp8], (%[dst]) \n\t"
- "sw %[tmp8], 4(%[dst]) \n\t"
-
- : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
- [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
- [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
- : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
- int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
-
- __asm__ __volatile__(
- "lw %[above1], (%[above]) \n\t"
- "lw %[above2], 4(%[above]) \n\t"
- "lw %[left1], (%[left]) \n\t"
- "lw %[left2], 4(%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above1] \n\t"
- "preceu.ph.qbr %[above_r1], %[above1] \n\t"
- "preceu.ph.qbl %[left_l1], %[left1] \n\t"
- "preceu.ph.qbr %[left_r1], %[left1] \n\t"
-
- "preceu.ph.qbl %[above_l2], %[above2] \n\t"
- "preceu.ph.qbr %[above_r2], %[above2] \n\t"
- "preceu.ph.qbl %[left_l2], %[left2] \n\t"
- "preceu.ph.qbr %[left_r2], %[left2] \n\t"
-
- "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "addu.ph %[average], %[average], %[above_l2] \n\t"
- "addu.ph %[average], %[average], %[above_r2] \n\t"
- "addu.ph %[average], %[average], %[left_l2] \n\t"
- "addu.ph %[average], %[average], %[left_r2] \n\t"
-
- "addiu %[average], %[average], 8 \n\t"
-
- "srl %[tmp], %[average], 16 \n\t"
- "addu.ph %[average], %[tmp], %[average] \n\t"
- "srl %[expected_dc], %[average], 4 \n\t"
- "replv.qb %[expected_dc], %[expected_dc] \n\t"
-
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
- [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
- [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
- [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
- [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
- [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
- [average] "=&r"(average), [tmp] "=&r"(tmp),
- [expected_dc] "=&r"(expected_dc)
- : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
- [stride] "r"(stride));
-}
-#endif // #if HAVE_DSPR2
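
For reference (not part of this diff): the DSPR2 DC predictors deleted above keep two 16-bit partial sums packed in one 32-bit register via addu.ph, then fold the high halfword into the low one before the final shift. A tiny sketch of that folding step, with an illustrative name:

#include <stdint.h>

/* Fold two packed 16-bit partial sums into a single total.
 * Mirrors "srl tmp, avg, 16" followed by "addu.ph avg, tmp, avg":
 * the low 16 bits of the result hold hi + lo (the sums are small enough
 * that no halfword overflow occurs here). */
static uint32_t fold_halfword_sums(uint32_t packed) {
  return (packed >> 16) + packed;
}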
diff --git a/aom_dsp/mips/intrapred_msa.c b/aom_dsp/mips/intrapred_msa.c
deleted file mode 100644
index 9f25cc1ca..000000000
--- a/aom_dsp/mips/intrapred_msa.c
+++ /dev/null
@@ -1,550 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
- { \
- out0 = __msa_subs_u_h(out0, in0); \
- out1 = __msa_subs_u_h(out1, in1); \
- }
-
-static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t src_data;
-
- src_data = LW(src);
-
- SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
-}
-
-static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- uint32_t src_data1, src_data2;
-
- src_data1 = LW(src);
- src_data2 = LW(src + 4);
-
- for (row = 8; row--;) {
- SW(src_data1, dst);
- SW(src_data2, (dst + 4));
- dst += dst_stride;
- }
-}
-
-static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 src0;
-
- src0 = LD_UB(src);
-
- for (row = 16; row--;) {
- ST_UB(src0, dst);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 src1, src2;
-
- src1 = LD_UB(src);
- src2 = LD_UB(src + 16);
-
- for (row = 32; row--;) {
- ST_UB2(src1, src2, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t out0, out1, out2, out3;
-
- out0 = src[0] * 0x01010101;
- out1 = src[1] * 0x01010101;
- out2 = src[2] * 0x01010101;
- out3 = src[3] * 0x01010101;
-
- SW4(out0, out1, out2, out3, dst, dst_stride);
-}
-
-static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
-
- out0 = src[0] * 0x0101010101010101ull;
- out1 = src[1] * 0x0101010101010101ull;
- out2 = src[2] * 0x0101010101010101ull;
- out3 = src[3] * 0x0101010101010101ull;
- out4 = src[4] * 0x0101010101010101ull;
- out5 = src[5] * 0x0101010101010101ull;
- out6 = src[6] * 0x0101010101010101ull;
- out7 = src[7] * 0x0101010101010101ull;
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
-}
-
-static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- uint8_t inp0, inp1, inp2, inp3;
- v16u8 src0, src1, src2, src3;
-
- for (row = 4; row--;) {
- inp0 = src[0];
- inp1 = src[1];
- inp2 = src[2];
- inp3 = src[3];
- src += 4;
-
- src0 = (v16u8)__msa_fill_b(inp0);
- src1 = (v16u8)__msa_fill_b(inp1);
- src2 = (v16u8)__msa_fill_b(inp2);
- src3 = (v16u8)__msa_fill_b(inp3);
-
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- uint8_t inp0, inp1, inp2, inp3;
- v16u8 src0, src1, src2, src3;
-
- for (row = 8; row--;) {
- inp0 = src[0];
- inp1 = src[1];
- inp2 = src[2];
- inp3 = src[3];
- src += 4;
-
- src0 = (v16u8)__msa_fill_b(inp0);
- src1 = (v16u8)__msa_fill_b(inp1);
- src2 = (v16u8)__msa_fill_b(inp2);
- src3 = (v16u8)__msa_fill_b(inp3);
-
- ST_UB2(src0, src0, dst, 16);
- dst += dst_stride;
- ST_UB2(src1, src1, dst, 16);
- dst += dst_stride;
- ST_UB2(src2, src2, dst, 16);
- dst += dst_stride;
- ST_UB2(src3, src3, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t val0, val1;
- v16i8 store, src = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- val0 = LW(src_top);
- val1 = LW(src_left);
- INSERT_W2_SB(val0, val1, src);
- sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_w((v4i32)store, 0);
-
- SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t val0;
- v16i8 store, data = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
-
- val0 = LW(src);
- data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
- sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_w((v4i32)store, 0);
-
- SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
- uint32_t out;
- const v16i8 store = __msa_ldi_b(128);
-
- out = __msa_copy_u_w((v4i32)store, 0);
-
- SW4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- uint64_t val0, val1;
- v16i8 store;
- v16u8 src = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- val0 = LD(src_top);
- val1 = LD(src_left);
- INSERT_D2_UB(val0, val1, src);
- sum_h = __msa_hadd_u_h(src, src);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_d((v2i64)store, 0);
-
- SD4(val0, val0, val0, val0, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint64_t val0;
- v16i8 store;
- v16u8 data = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- val0 = LD(src);
- data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
- sum_h = __msa_hadd_u_h(data, data);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_d((v2i64)store, 0);
-
- SD4(val0, val0, val0, val0, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
- uint64_t out;
- const v16i8 store = __msa_ldi_b(128);
-
- out = __msa_copy_u_d((v2i64)store, 0);
-
- SD4(out, out, out, out, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- v16u8 top, left, out;
- v8u16 sum_h, sum_top, sum_left;
- v4u32 sum_w;
- v2u64 sum_d;
-
- top = LD_UB(src_top);
- left = LD_UB(src_left);
- HADD_UB2_UH(top, left, sum_top, sum_left);
- sum_h = sum_top + sum_left;
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
- dst += (8 * dst_stride);
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- v16u8 data, out;
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- data = LD_UB(src);
- sum_h = __msa_hadd_u_h(data, data);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
- dst += (8 * dst_stride);
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
- const v16u8 out = (v16u8)__msa_ldi_b(128);
-
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
- dst += (8 * dst_stride);
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 top0, top1, left0, left1, out;
- v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
- v4u32 sum_w;
- v2u64 sum_d;
-
- LD_UB2(src_top, 16, top0, top1);
- LD_UB2(src_left, 16, left0, left1);
- HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
- HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
- sum_h = sum_top0 + sum_top1;
- sum_h += sum_left0 + sum_left1;
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- for (row = 16; row--;) {
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 data0, data1, out;
- v8u16 sum_h, sum_data0, sum_data1;
- v4u32 sum_w;
- v2u64 sum_d;
-
- LD_UB2(src, 16, data0, data1);
- HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
- sum_h = sum_data0 + sum_data1;
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- for (row = 16; row--;) {
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
- uint32_t row;
- const v16u8 out = (v16u8)__msa_ldi_b(128);
-
- for (row = 16; row--;) {
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- }
-}
-
-void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_4x4_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_8x8_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_16x16_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_32x32_msa(above, dst, y_stride);
-}
-
-void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_4x4_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_8x8_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_16x16_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_4x4_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_8x8_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_16x16_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_32x32_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
-}
-
-void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_4x4_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_8x8_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_16x16_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_32x32_msa(dst, y_stride);
-}
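
For reference (not part of this diff): besides the full DC average, the MSA wrappers deleted above dispatch three edge-only variants: dc_top averages only the above row, dc_left only the left column, and dc_128 fills the block with mid-grey. A rough scalar sketch with illustrative names:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative NxN fill for the dc_top / dc_left / dc_128 variants.
 * Pass the above row or left column as 'edge', or NULL for the 128 case. */
static void dc_edge_fill_sketch(uint8_t *dst, ptrdiff_t stride, int n,
                                const uint8_t *edge) {
  int dc = 128; /* dc_128: constant mid-grey */
  if (edge != NULL) {
    int sum = n / 2; /* rounding term for n edge samples */
    for (int i = 0; i < n; ++i) sum += edge[i];
    dc = sum / n;
  }
  for (int r = 0; r < n; ++r) {
    memset(dst, (uint8_t)dc, n);
    dst += stride;
  }
}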
diff --git a/aom_dsp/mips/loopfilter_16_msa.c b/aom_dsp/mips/loopfilter_16_msa.c
deleted file mode 100644
index 38a10e9b2..000000000
--- a/aom_dsp/mips/loopfilter_16_msa.c
+++ /dev/null
@@ -1,1488 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16u8 zero = { 0 };
-
- /* load vector elements */
- LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
-
- return 1;
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
- filter48 += (4 * 16);
- ST_UB2(q1_out, q2_out, filter48, 16);
- filter48 += (2 * 16);
- ST_UB(flat, filter48);
-
- return 0;
- }
-}
-
-void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
- v16u8 flat, flat2, filter8;
- v16i8 zero = { 0 };
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
- v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
- v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
- v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
- v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
- v8i16 l_out, r_out;
-
- flat = LD_UB(filter48 + 96);
-
- LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- LD_UB4(filter48, 16, p2, p1, p0, q0);
- LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
- src -= 3 * pitch;
- ST_UB4(p2, p1, p0, q0, src, pitch);
- src += (4 * pitch);
- ST_UB2(q1, q2, src, pitch);
- } else {
- src -= 7 * pitch;
-
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
- p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
- p2_r_in, p1_r_in, p0_r_in);
-
- q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
- tmp0_r = p7_r_in << 3;
- tmp0_r -= p7_r_in;
- tmp0_r += p6_r_in;
- tmp0_r += q0_r_in;
- tmp1_r = p6_r_in + p5_r_in;
- tmp1_r += p4_r_in;
- tmp1_r += p3_r_in;
- tmp1_r += p2_r_in;
- tmp1_r += p1_r_in;
- tmp1_r += p0_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
- p5_l_in, p4_l_in);
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
- p1_l_in, p0_l_in);
- q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
-
- tmp0_l = p7_l_in << 3;
- tmp0_l -= p7_l_in;
- tmp0_l += p6_l_in;
- tmp0_l += q0_l_in;
- tmp1_l = p6_l_in + p5_l_in;
- tmp1_l += p4_l_in;
- tmp1_l += p3_l_in;
- tmp1_l += p2_l_in;
- tmp1_l += p1_l_in;
- tmp1_l += p0_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
- ST_UB(p6, src);
- src += pitch;
-
- /* p5 */
- q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
- tmp0_r = p5_r_in - p6_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
- tmp0_l = p5_l_in - p6_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
- ST_UB(p5, src);
- src += pitch;
-
- /* p4 */
- q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
- tmp0_r = p4_r_in - p5_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
-
- q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
- tmp0_l = p4_l_in - p5_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
- ST_UB(p4, src);
- src += pitch;
-
- /* p3 */
- q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
- tmp0_r = p3_r_in - p4_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
- tmp0_l = p3_l_in - p4_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
- ST_UB(p3, src);
- src += pitch;
-
- /* p2 */
- q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
- filter8 = LD_UB(filter48);
- tmp0_r = p2_r_in - p3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
- tmp0_l = p2_l_in - p3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* p1 */
- q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
- filter8 = LD_UB(filter48 + 16);
- tmp0_r = p1_r_in - p2_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
- tmp0_l = p1_l_in - p2_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* p0 */
- q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
- filter8 = LD_UB(filter48 + 32);
- tmp0_r = p0_r_in - p1_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
- tmp0_l = p0_l_in - p1_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q0 */
- q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
- filter8 = LD_UB(filter48 + 48);
- tmp0_r = q7_r_in - p0_r_in;
- tmp0_r += q0_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
- tmp0_l = q7_l_in - p0_l_in;
- tmp0_l += q0_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q1 */
- filter8 = LD_UB(filter48 + 64);
- tmp0_r = q7_r_in - q0_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p6_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q0_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p6_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q2 */
- filter8 = LD_UB(filter48 + 80);
- tmp0_r = q7_r_in - q1_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p5_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q1_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p5_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q3 */
- tmp0_r = q7_r_in - q2_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p4_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q2_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p4_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
- ST_UB(q3, src);
- src += pitch;
-
- /* q4 */
- tmp0_r = q7_r_in - q3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p3_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p3_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
- ST_UB(q4, src);
- src += pitch;
-
- /* q5 */
- tmp0_r = q7_r_in - q4_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p2_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q4_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p2_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
- ST_UB(q5, src);
- src += pitch;
-
- /* q6 */
- tmp0_r = q7_r_in - q5_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p1_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q5_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p1_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
- ST_UB(q6, src);
- }
-}
-
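
For reference (not part of this diff): the aom_hz_lpf_t16_16w body deleted above applies the wide (flat2) filter incrementally. Each output row reuses the previous window sum, adjusted by the samples entering and leaving the window, then rounded and shifted by 4 via __msa_srari_h(..., 4). A tiny sketch of that update, with illustrative names:

#include <stdint.h>

/* One incremental step of the wide filter: 'delta' is the net change of the
 * sliding window (samples entering minus samples leaving); the return value
 * is the rounded average, matching the srari-by-4 in the deleted code. */
static uint8_t wide_filter_step(int *window_sum, int delta) {
  *window_sum += delta;
  return (uint8_t)((*window_sum + 8) >> 4);
}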
-static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr,
- int32_t count) {
- DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
- uint8_t early_exit = 0;
-
- (void)count;
-
- early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
- limit_ptr, thresh_ptr);
-
- if (0 == early_exit) {
- aom_hz_lpf_t16_16w(src, pitch, filter48);
- }
-}
-
-static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr, int32_t count) {
- if (1 == count) {
- uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
- uint64_t dword0, dword1;
- v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 p0_filter16, p1_filter16;
- v8i16 p2_filter8, p1_filter8, p0_filter8;
- v8i16 q0_filter8, q1_filter8, q2_filter8;
- v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
- v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
- v16i8 zero = { 0 };
- v8u16 tmp0, tmp1, tmp2;
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
- q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
- if (__msa_test_bz_v(flat)) {
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
- } else {
- /* convert 8 bit input data into 16 bit */
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
- q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
- p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
- q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
- PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
-
- /* load 16 vector elements */
- LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
- LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
-
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
-
- SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
- SD(q1_d, src + pitch);
- SD(q2_d, src + 2 * pitch);
- } else {
- /* LSB(right) 8 pixel operation */
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
- zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
- q7_r);
-
- tmp0 = p7_r << 3;
- tmp0 -= p7_r;
- tmp0 += p6_r;
- tmp0 += q0_r;
-
- src -= 7 * pitch;
-
- /* calculation of p6 and p5 */
- tmp1 = p6_r + p5_r + p4_r + p3_r;
- tmp1 += (p2_r + p1_r + p0_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp0 = p5_r - p6_r + q1_r - p7_r;
- tmp1 += tmp0;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of p4 and p3 */
- tmp0 = p4_r - p5_r + q2_r - p7_r;
- tmp2 = p3_r - p4_r + q3_r - p7_r;
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of p2 and p1 */
- tmp0 = p2_r - p3_r + q4_r - p7_r;
- tmp2 = p1_r - p2_r + q5_r - p7_r;
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of p0 and q0 */
- tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
- tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of q1 and q2 */
- tmp0 = q7_r - q0_r + q1_r - p6_r;
- tmp2 = q7_r - q1_r + q2_r - p5_r;
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of q3 and q4 */
- tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
- tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of q5 and q6 */
- tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
- tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- }
- }
- } else {
- mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
- count);
- }
-}
-
-void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
-}
-
-void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
-}
-
-static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
- uint8_t *output, int32_t out_pitch) {
- v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
- v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-
- LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
- p1_org, p0_org);
- /* 8x8 transpose */
- TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
- p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
- /* 8x8 transpose */
- ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
- tmp0, tmp1, tmp2, tmp3);
- ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
- ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
- ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
- ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
- SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
-
- ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
- output += (8 * out_pitch);
- ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
-}
-
-static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
- uint8_t *output, int32_t out_pitch) {
- v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-
- LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
- TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
- q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
- ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
-}
-
-static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
- int32_t out_pitch) {
- v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
- v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
- v4i32 tmp2, tmp3;
-
- LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
- input += (8 * in_pitch);
- LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
-
- TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
- row9, row10, row11, row12, row13, row14, row15, p7, p6,
- p5, p4, p3, p2, p1, p0);
-
- /* transpose 16x8 matrix into 8x16 */
- /* total 8 intermediate register and 32 instructions */
- q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
- q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
- q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
- q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
- q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
- q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
- q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
- q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
-
- ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
- tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
- tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
-
- ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
- tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
- tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
-
- ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
- q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
- tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
- q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
- q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
- tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
- q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
- output += (8 * out_pitch);
- ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
-}
-
-int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
- uint8_t *src_org, int32_t pitch_org,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v16i8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3;
-
- /* load vector elements */
- LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
- if (__msa_test_bz_v(flat)) {
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
- return 1;
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- /* convert 16 bit output data into 8 bit */
- p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
- p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
- p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
- q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
- q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
- q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
- filter48 += (4 * 16);
- ST_UB2(q1_out, q2_out, filter48, 16);
- filter48 += (2 * 16);
- ST_UB(flat, filter48);
-
- return 0;
- }
-}
-
-int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
- uint8_t *filter48) {
- v16i8 zero = { 0 };
- v16u8 filter8, flat, flat2;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
- v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
- v8u16 tmp0_r, tmp1_r;
- v8i16 r_out;
-
- flat = LD_UB(filter48 + 6 * 16);
-
- LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
-
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- v8i16 vec0, vec1, vec2, vec3, vec4;
-
- LD_UB4(filter48, 16, p2, p1, p0, q0);
- LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec3, vec4);
- vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
-
- src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
-
- return 1;
- } else {
- src -= 7 * 16;
-
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
- p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
- p2_r_in, p1_r_in, p0_r_in);
- q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
- tmp0_r = p7_r_in << 3;
- tmp0_r -= p7_r_in;
- tmp0_r += p6_r_in;
- tmp0_r += q0_r_in;
- tmp1_r = p6_r_in + p5_r_in;
- tmp1_r += p4_r_in;
- tmp1_r += p3_r_in;
- tmp1_r += p2_r_in;
- tmp1_r += p1_r_in;
- tmp1_r += p0_r_in;
- tmp1_r += tmp0_r;
-
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
- ST8x1_UB(p6, src);
- src += 16;
-
- /* p5 */
- q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
- tmp0_r = p5_r_in - p6_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
- ST8x1_UB(p5, src);
- src += 16;
-
- /* p4 */
- q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
- tmp0_r = p4_r_in - p5_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
- ST8x1_UB(p4, src);
- src += 16;
-
- /* p3 */
- q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
- tmp0_r = p3_r_in - p4_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
- ST8x1_UB(p3, src);
- src += 16;
-
- /* p2 */
- q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
- filter8 = LD_UB(filter48);
- tmp0_r = p2_r_in - p3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* p1 */
- q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
- filter8 = LD_UB(filter48 + 16);
- tmp0_r = p1_r_in - p2_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* p0 */
- q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
- filter8 = LD_UB(filter48 + 32);
- tmp0_r = p0_r_in - p1_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q0 */
- q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
- filter8 = LD_UB(filter48 + 48);
- tmp0_r = q7_r_in - p0_r_in;
- tmp0_r += q0_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q1 */
- filter8 = LD_UB(filter48 + 64);
- tmp0_r = q7_r_in - q0_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p6_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q2 */
- filter8 = LD_UB(filter48 + 80);
- tmp0_r = q7_r_in - q1_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p5_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q3 */
- tmp0_r = q7_r_in - q2_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p4_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
- ST8x1_UB(q3, src);
- src += 16;
-
- /* q4 */
- tmp0_r = q7_r_in - q3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p3_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
- ST8x1_UB(q4, src);
- src += 16;
-
- /* q5 */
- tmp0_r = q7_r_in - q4_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p2_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
- ST8x1_UB(q5, src);
- src += 16;
-
- /* q6 */
- tmp0_r = q7_r_in - q5_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p1_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
- ST8x1_UB(q6, src);
-
- return 0;
- }
-}
-
-void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint8_t early_exit = 0;
- DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
- uint8_t *filter48 = &transposed_input[16 * 16];
-
- transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
-
- early_exit =
- aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
- pitch, b_limit_ptr, limit_ptr, thresh_ptr);
-
- if (0 == early_exit) {
- early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
- &filter48[0]);
-
- if (0 == early_exit) {
- transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
- }
- }
-}
-
-int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
- uint8_t *src_org, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16i8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
-
- /* load vector elements */
- LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec4, vec5);
-
- src_org -= 2;
- ST4x8_UB(vec2, vec3, src_org, pitch);
- src_org += 8 * pitch;
- ST4x8_UB(vec4, vec5, src_org, pitch);
-
- return 1;
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
- filter48 += (4 * 16);
- ST_UB2(q1_out, q2_out, filter48, 16);
- filter48 += (2 * 16);
- ST_UB(flat, filter48);
-
- return 0;
- }
-}
-
-int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
- uint8_t *filter48) {
- v16u8 flat, flat2, filter8;
- v16i8 zero = { 0 };
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
- v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
- v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
- v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
- v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
- v8i16 l_out, r_out;
-
- flat = LD_UB(filter48 + 6 * 16);
-
- LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
-
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-
- LD_UB4(filter48, 16, p2, p1, p0, q0);
- LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec3, vec4);
- ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec6, vec7);
- ILVRL_B2_SH(q2, q1, vec2, vec5);
-
- src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 0, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 4, (src_org + 4), pitch);
-
- return 1;
- } else {
- src -= 7 * 16;
-
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
- p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
- p2_r_in, p1_r_in, p0_r_in);
- q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
- tmp0_r = p7_r_in << 3;
- tmp0_r -= p7_r_in;
- tmp0_r += p6_r_in;
- tmp0_r += q0_r_in;
- tmp1_r = p6_r_in + p5_r_in;
- tmp1_r += p4_r_in;
- tmp1_r += p3_r_in;
- tmp1_r += p2_r_in;
- tmp1_r += p1_r_in;
- tmp1_r += p0_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
- p5_l_in, p4_l_in);
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
- p1_l_in, p0_l_in);
- q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
-
- tmp0_l = p7_l_in << 3;
- tmp0_l -= p7_l_in;
- tmp0_l += p6_l_in;
- tmp0_l += q0_l_in;
- tmp1_l = p6_l_in + p5_l_in;
- tmp1_l += p4_l_in;
- tmp1_l += p3_l_in;
- tmp1_l += p2_l_in;
- tmp1_l += p1_l_in;
- tmp1_l += p0_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
- ST_UB(p6, src);
- src += 16;
-
- /* p5 */
- q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
- tmp0_r = p5_r_in - p6_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
- tmp0_l = p5_l_in - p6_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
- ST_UB(p5, src);
- src += 16;
-
- /* p4 */
- q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
- tmp0_r = p4_r_in - p5_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
- tmp0_l = p4_l_in - p5_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
- ST_UB(p4, src);
- src += 16;
-
- /* p3 */
- q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
- tmp0_r = p3_r_in - p4_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
- tmp0_l = p3_l_in - p4_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
- ST_UB(p3, src);
- src += 16;
-
- /* p2 */
- q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
- filter8 = LD_UB(filter48);
- tmp0_r = p2_r_in - p3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
- tmp0_l = p2_l_in - p3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* p1 */
- q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
- filter8 = LD_UB(filter48 + 16);
- tmp0_r = p1_r_in - p2_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
- tmp0_l = p1_l_in - p2_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* p0 */
- q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
- filter8 = LD_UB(filter48 + 32);
- tmp0_r = p0_r_in - p1_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
- tmp0_l = p0_l_in - p1_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q0 */
- q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
- filter8 = LD_UB(filter48 + 48);
- tmp0_r = q7_r_in - p0_r_in;
- tmp0_r += q0_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
- tmp0_l = q7_l_in - p0_l_in;
- tmp0_l += q0_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q1 */
- filter8 = LD_UB(filter48 + 64);
- tmp0_r = q7_r_in - q0_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p6_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q0_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p6_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q2 */
- filter8 = LD_UB(filter48 + 80);
- tmp0_r = q7_r_in - q1_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p5_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q1_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p5_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q3 */
- tmp0_r = q7_r_in - q2_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p4_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q2_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p4_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
- ST_UB(q3, src);
- src += 16;
-
- /* q4 */
- tmp0_r = q7_r_in - q3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p3_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p3_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
- ST_UB(q4, src);
- src += 16;
-
- /* q5 */
- tmp0_r = q7_r_in - q4_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p2_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q4_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p2_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
- ST_UB(q5, src);
- src += 16;
-
- /* q6 */
- tmp0_r = q7_r_in - q5_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p1_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q5_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p1_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
- ST_UB(q6, src);
-
- return 0;
- }
-}
-
-void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint8_t early_exit = 0;
- DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
- uint8_t *filter48 = &transposed_input[16 * 16];
-
- transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
-
- early_exit =
- aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
- pitch, b_limit_ptr, limit_ptr, thresh_ptr);
-
- if (0 == early_exit) {
- early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
- &filter48[0]);
-
- if (0 == early_exit) {
- transpose_16x16(transposed_input, 16, (src - 8), pitch);
- }
- }
-}
diff --git a/aom_dsp/mips/loopfilter_4_msa.c b/aom_dsp/mips/loopfilter_4_msa.c
deleted file mode 100644
index dc0a97764..000000000
--- a/aom_dsp/mips/loopfilter_4_msa.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint64_t p1_d, p0_d, q0_d, q1_d;
- v16u8 mask, hev, flat, thresh, b_limit, limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
-}
-
-void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0_ptr,
- const uint8_t *limit0_ptr,
- const uint8_t *thresh0_ptr,
- const uint8_t *b_limit1_ptr,
- const uint8_t *limit1_ptr,
- const uint8_t *thresh1_ptr) {
- v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
- thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
- thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
-
- b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
- b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
- b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
-
- limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
- limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
- limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
- mask, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
-
- ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
-}
-
-void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 mask, hev, flat, limit, thresh, b_limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v8i16 vec0, vec1, vec2, vec3;
-
- LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
- q3);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
- ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-
- src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
-}
-
-void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0_ptr,
- const uint8_t *limit0_ptr,
- const uint8_t *thresh0_ptr,
- const uint8_t *b_limit1_ptr,
- const uint8_t *limit1_ptr,
- const uint8_t *thresh1_ptr) {
- v16u8 mask, hev, flat;
- v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
- v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
- v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-
- LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
- LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
- row14, row15);
-
- TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
- row9, row10, row11, row12, row13, row14, row15, p3, p2,
- p1, p0, q0, q1, q2, q3);
-
- thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
- thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
- thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
-
- b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
- b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
- b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
-
- limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
- limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
- limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
- mask, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
- ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
- ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
- ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
- ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
-
- src -= 2;
-
- ST4x8_UB(tmp2, tmp3, src, pitch);
- src += (8 * pitch);
- ST4x8_UB(tmp4, tmp5, src, pitch);
-}
diff --git a/aom_dsp/mips/loopfilter_8_msa.c b/aom_dsp/mips/loopfilter_8_msa.c
deleted file mode 100644
index dc203e79c..000000000
--- a/aom_dsp/mips/loopfilter_8_msa.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
- v16u8 mask, hev, flat, thresh, b_limit, limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
- v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
- v16i8 zero = { 0 };
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
- if (__msa_test_bz_v(flat)) {
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
- p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
- q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
- PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
-
- p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
-
- src -= 3 * pitch;
-
- SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
- src += (4 * pitch);
- SD(q1_d, src);
- src += pitch;
- SD(q2_d, src);
- }
-}
-
-void aom_lpf_horizontal_8_dual_msa(
- uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16u8 zero = { 0 };
-
- /* load vector elements */
- LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh0);
- tmp = (v16u8)__msa_fill_b(*thresh1);
- thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
-
- b_limit = (v16u8)__msa_fill_b(*b_limit0);
- tmp = (v16u8)__msa_fill_b(*b_limit1);
- b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
-
- limit = (v16u8)__msa_fill_b(*limit0);
- tmp = (v16u8)__msa_fill_b(*limit1);
- limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- src -= 3 * pitch;
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
- src += (4 * pitch);
- ST_UB2(q1_out, q2_out, src, pitch);
- src += (2 * pitch);
- }
-}
-
-void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p1_out, p0_out, q0_out, q1_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v16u8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3, vec4;
-
- /* load vector elements */
- LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
- q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
- if (__msa_test_bz_v(flat)) {
-    /* Store 4 pixels p1 - q1 */
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-
- src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
- p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
-    /* Store 6 pixels p2 - q2 */
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
-
- src -= 3;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 0, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 4, src + 4, pitch);
- }
-}
-
-void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0, const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *b_limit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- uint8_t *temp_src;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p1_out, p0_out, q0_out, q1_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16u8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-
- temp_src = src - 4;
-
- LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
- temp_src += (8 * pitch);
- LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
-
- /* transpose 16x8 matrix into 8x16 */
- TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
- row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
- q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh0);
- vec0 = (v8i16)__msa_fill_b(*thresh1);
- thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
-
- b_limit = (v16u8)__msa_fill_b(*b_limit0);
- vec0 = (v8i16)__msa_fill_b(*b_limit1);
- b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
-
- limit = (v16u8)__msa_fill_b(*limit0);
- vec0 = (v8i16)__msa_fill_b(*limit1);
- limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec4, vec5);
-
- src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
-
- /* filter8 */
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec3, vec4);
- ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec6, vec7);
- ILVRL_B2_SH(q2, q1, vec2, vec5);
-
- src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
- }
-}
diff --git a/aom_dsp/mips/loopfilter_filters_dspr2.c b/aom_dsp/mips/loopfilter_filters_dspr2.c
deleted file mode 100644
index 8c41278be..000000000
--- a/aom_dsp/mips/loopfilter_filters_dspr2.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask;
- uint32_t hev;
- uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
- uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s);
-
- /* loop filter designed to work using chars so that we can make maximum use
- of 8 bit simd instructions. */
- for (i = 0; i < 2; i++) {
- sm1 = s - (pitch << 2);
- s0 = sm1 + pitch;
- s1 = s0 + pitch;
- s2 = s - pitch;
- s3 = s;
- s4 = s + pitch;
- s5 = s4 + pitch;
- s6 = s5 + pitch;
-
- __asm__ __volatile__(
- "lw %[p1], (%[s1]) \n\t"
- "lw %[p2], (%[s2]) \n\t"
- "lw %[p3], (%[s3]) \n\t"
- "lw %[p4], (%[s4]) \n\t"
-
- : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- /* if (p1 - p4 == 0) and (p2 - p3 == 0)
- mask will be zero and filtering is not needed */
- if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
- __asm__ __volatile__(
- "lw %[pm1], (%[sm1]) \n\t"
- "lw %[p0], (%[s0]) \n\t"
- "lw %[p5], (%[s5]) \n\t"
- "lw %[p6], (%[s6]) \n\t"
-
- : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
- : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
-
- filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
- p6, thresh_vec, &hev, &mask);
-
-      /* if mask == 0, filtering is not needed */
- if (mask) {
- /* filtering */
- filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
-
- __asm__ __volatile__(
- "sw %[p1], (%[s1]) \n\t"
- "sw %[p2], (%[s2]) \n\t"
- "sw %[p3], (%[s3]) \n\t"
- "sw %[p4], (%[s4]) \n\t"
-
- :
- : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
- [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
- }
- }
-
- s = s + 4;
- }
-}
-
-void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev;
- uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s + pitch);
-
- for (i = 0; i < 2; i++) {
- s1 = s;
- s2 = s + pitch;
- s3 = s2 + pitch;
- s4 = s3 + pitch;
- s = s4 + pitch;
-
- /* load quad-byte vectors
- * memory is 4 byte aligned
- */
- p2 = *((uint32_t *)(s1 - 4));
- p6 = *((uint32_t *)(s1));
- p1 = *((uint32_t *)(s2 - 4));
- p5 = *((uint32_t *)(s2));
- p0 = *((uint32_t *)(s3 - 4));
- p4 = *((uint32_t *)(s3));
- pm1 = *((uint32_t *)(s4 - 4));
- p3 = *((uint32_t *)(s4));
-
- /* transpose pm1, p0, p1, p2 */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
- "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
- "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
- "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
-
- "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
- "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
- "append %[p1], %[sec3], 16 \n\t"
- "append %[pm1], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
- [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose p3, p4, p5, p6 */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
- "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
- "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
- "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
-
- "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
- "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
- "append %[p5], %[sec3], 16 \n\t"
- "append %[p3], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
- [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* if (p1 - p4 == 0) and (p2 - p3 == 0)
- * mask will be zero and filtering is not needed
- */
- if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
- filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
- p6, thresh_vec, &hev, &mask);
-
-      /* if mask == 0, filtering is not needed */
- if (mask) {
- /* filtering */
- filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
-
- /* unpack processed 4x4 neighborhood
- * don't use transpose on output data
- * because memory isn't aligned
- */
- __asm__ __volatile__(
- "sb %[p4], 1(%[s4]) \n\t"
- "sb %[p3], 0(%[s4]) \n\t"
- "sb %[p2], -1(%[s4]) \n\t"
- "sb %[p1], -2(%[s4]) \n\t"
-
- :
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [s4] "r"(s4));
-
- __asm__ __volatile__(
- "srl %[p4], %[p4], 8 \n\t"
- "srl %[p3], %[p3], 8 \n\t"
- "srl %[p2], %[p2], 8 \n\t"
- "srl %[p1], %[p1], 8 \n\t"
-
- : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
- :);
-
- __asm__ __volatile__(
- "sb %[p4], 1(%[s3]) \n\t"
- "sb %[p3], 0(%[s3]) \n\t"
- "sb %[p2], -1(%[s3]) \n\t"
- "sb %[p1], -2(%[s3]) \n\t"
-
- : [p1] "+r"(p1)
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
-
- __asm__ __volatile__(
- "srl %[p4], %[p4], 8 \n\t"
- "srl %[p3], %[p3], 8 \n\t"
- "srl %[p2], %[p2], 8 \n\t"
- "srl %[p1], %[p1], 8 \n\t"
-
- : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
- :);
-
- __asm__ __volatile__(
- "sb %[p4], 1(%[s2]) \n\t"
- "sb %[p3], 0(%[s2]) \n\t"
- "sb %[p2], -1(%[s2]) \n\t"
- "sb %[p1], -2(%[s2]) \n\t"
-
- :
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [s2] "r"(s2));
-
- __asm__ __volatile__(
- "srl %[p4], %[p4], 8 \n\t"
- "srl %[p3], %[p3], 8 \n\t"
- "srl %[p2], %[p2], 8 \n\t"
- "srl %[p1], %[p1], 8 \n\t"
-
- : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
- :);
-
- __asm__ __volatile__(
- "sb %[p4], 1(%[s1]) \n\t"
- "sb %[p3], 0(%[s1]) \n\t"
- "sb %[p2], -1(%[s1]) \n\t"
- "sb %[p1], -2(%[s1]) \n\t"
-
- :
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [s1] "r"(s1));
- }
- }
- }
-}
-
-void aom_lpf_horizontal_4_dual_dspr2(
- uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
- const uint8_t *limit1, const uint8_t *thresh1) {
- aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_horizontal_8_dual_dspr2(
- uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
- const uint8_t *limit1, const uint8_t *thresh1) {
- aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
- aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
-}
-#endif // #if HAVE_DSPR2
diff --git a/aom_dsp/mips/loopfilter_filters_dspr2.h b/aom_dsp/mips/loopfilter_filters_dspr2.h
deleted file mode 100644
index 28f0dc35a..000000000
--- a/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ /dev/null
@@ -1,736 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* inputs & outputs are quad-byte vectors */
-static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
- uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
- int32_t aom_filter_l, aom_filter_r;
- int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
- int32_t subr_r, subr_l;
- uint32_t t1, t2, HWM, t3;
- uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
- int32_t vps1, vps0, vqs0, vqs1;
- int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
- uint32_t N128;
-
- N128 = 0x80808080;
- t1 = 0x03000300;
- t2 = 0x04000400;
- t3 = 0x01000100;
- HWM = 0xFF00FF00;
-
- vps0 = (*ps0) ^ N128;
- vps1 = (*ps1) ^ N128;
- vqs0 = (*qs0) ^ N128;
- vqs1 = (*qs1) ^ N128;
-
-  /* use halfword pairs instead of quad-bytes for accuracy */
- vps0_l = vps0 & HWM;
- vps0_r = vps0 << 8;
- vps0_r = vps0_r & HWM;
-
- vps1_l = vps1 & HWM;
- vps1_r = vps1 << 8;
- vps1_r = vps1_r & HWM;
-
- vqs0_l = vqs0 & HWM;
- vqs0_r = vqs0 << 8;
- vqs0_r = vqs0_r & HWM;
-
- vqs1_l = vqs1 & HWM;
- vqs1_r = vqs1 << 8;
- vqs1_r = vqs1_r & HWM;
-
- mask_l = mask & HWM;
- mask_r = mask << 8;
- mask_r = mask_r & HWM;
-
- hev_l = hev & HWM;
- hev_r = hev << 8;
- hev_r = hev_r & HWM;
-
- __asm__ __volatile__(
- /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
- "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
- "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
-
- /* qs0 - ps0 */
- "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
- "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
-
- /* aom_filter &= hev; */
- "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
-
- /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
-
- /* aom_filter &= mask; */
- "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
-
- : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
- [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
- [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
- : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
- [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
- [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
- [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
- [HWM] "r"(HWM));
-
- /* save bottom 3 bits so that we round one side +4 and the other +3 */
- __asm__ __volatile__(
- /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
- "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
- "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
-
- /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
- "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
- "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
- "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
- "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
-
- "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
- "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
-
- "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
-
- /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
- "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
- "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
-
- /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
- "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
- [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
- [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
- [vqs0_r] "+r"(vqs0_r)
- : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
- [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
-
- __asm__ __volatile__(
- /* (aom_filter += 1) >>= 1 */
- "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
- "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
-
- /* aom_filter &= ~hev; */
- "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
-
- /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
- "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
- "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
-
- /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
- "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
- [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
- [vqs1_r] "+r"(vqs1_r)
- : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
-
- /* Create quad-bytes from halfword pairs */
- vqs0_l = vqs0_l & HWM;
- vqs1_l = vqs1_l & HWM;
- vps0_l = vps0_l & HWM;
- vps1_l = vps1_l & HWM;
-
- __asm__ __volatile__(
- "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
- "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
- "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
- "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
-
- : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
- [vqs0_r] "+r"(vqs0_r)
- :);
-
- vqs0 = vqs0_l | vqs0_r;
- vqs1 = vqs1_l | vqs1_r;
- vps0 = vps0_l | vps0_r;
- vps1 = vps1_l | vps1_r;
-
- *ps0 = vps0 ^ N128;
- *ps1 = vps1 ^ N128;
- *qs0 = vqs0 ^ N128;
- *qs1 = vqs1 ^ N128;
-}
-
-static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
- uint32_t ps0, uint32_t qs0, uint32_t qs1,
- uint32_t *p1_f0, uint32_t *p0_f0,
- uint32_t *q0_f0, uint32_t *q1_f0) {
- int32_t aom_filter_l, aom_filter_r;
- int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
- int32_t subr_r, subr_l;
- uint32_t t1, t2, HWM, t3;
- uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
- int32_t vps1, vps0, vqs0, vqs1;
- int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
- uint32_t N128;
-
- N128 = 0x80808080;
- t1 = 0x03000300;
- t2 = 0x04000400;
- t3 = 0x01000100;
- HWM = 0xFF00FF00;
-
- vps0 = (ps0) ^ N128;
- vps1 = (ps1) ^ N128;
- vqs0 = (qs0) ^ N128;
- vqs1 = (qs1) ^ N128;
-
-  /* use halfword pairs instead of quad-bytes for accuracy */
- vps0_l = vps0 & HWM;
- vps0_r = vps0 << 8;
- vps0_r = vps0_r & HWM;
-
- vps1_l = vps1 & HWM;
- vps1_r = vps1 << 8;
- vps1_r = vps1_r & HWM;
-
- vqs0_l = vqs0 & HWM;
- vqs0_r = vqs0 << 8;
- vqs0_r = vqs0_r & HWM;
-
- vqs1_l = vqs1 & HWM;
- vqs1_r = vqs1 << 8;
- vqs1_r = vqs1_r & HWM;
-
- mask_l = mask & HWM;
- mask_r = mask << 8;
- mask_r = mask_r & HWM;
-
- hev_l = hev & HWM;
- hev_r = hev << 8;
- hev_r = hev_r & HWM;
-
- __asm__ __volatile__(
- /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
- "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
- "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
-
- /* qs0 - ps0 */
- "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
- "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
-
- /* aom_filter &= hev; */
- "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
-
- /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
-
- /* aom_filter &= mask; */
- "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
-
- : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
- [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
- [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
- : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
- [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
- [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
- [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
- [HWM] "r"(HWM));
-
- /* save bottom 3 bits so that we round one side +4 and the other +3 */
- __asm__ __volatile__(
- /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
- "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
- "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
-
- /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
- "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
- "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
- "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
- "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
-
- "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
- "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
-
- "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
-
- /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
- "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
- "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
-
- /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
- "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
- [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
- [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
- [vqs0_r] "+r"(vqs0_r)
- : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
- [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
-
- __asm__ __volatile__(
- /* (aom_filter += 1) >>= 1 */
- "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
- "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
-
- /* aom_filter &= ~hev; */
- "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
-
- /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
- "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
- "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
-
- /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
- "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
- [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
- [vqs1_r] "+r"(vqs1_r)
- : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
-
- /* Create quad-bytes from halfword pairs */
- vqs0_l = vqs0_l & HWM;
- vqs1_l = vqs1_l & HWM;
- vps0_l = vps0_l & HWM;
- vps1_l = vps1_l & HWM;
-
- __asm__ __volatile__(
- "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
- "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
- "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
- "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
-
- : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
- [vqs0_r] "+r"(vqs0_r)
- :);
-
- vqs0 = vqs0_l | vqs0_r;
- vqs1 = vqs1_l | vqs1_r;
- vps0 = vps0_l | vps0_r;
- vps1 = vps1_l | vps1_r;
-
- *p0_f0 = vps0 ^ N128;
- *p1_f0 = vps1 ^ N128;
- *q0_f0 = vqs0 ^ N128;
- *q1_f0 = vqs1 ^ N128;
-}
-
-static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
- uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
- uint32_t *oq2, uint32_t *oq3) {
- /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
- const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
- uint32_t res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2;
- uint32_t tmp;
- uint32_t add_p210_q012;
- uint32_t u32Four = 0x00040004;
-
- /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
- /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
- /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
- /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
- /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
- /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
-
- __asm__ __volatile__(
- "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
-
- "shll.ph %[tmp], %[p3], 1 \n\t"
- "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
- "addu.ph %[res_op1], %[p3], %[p3] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
- "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
- "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
- "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
- "addu.ph %[res_op0], %[p3], %[p0] \n\t"
- "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
- "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
- "shll.ph %[tmp], %[q3], 1 \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
- "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
- "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
- "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
- "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
- "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
- "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
-
- : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
- [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
- [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
- [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
- [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
-
- *op2 = res_op2;
- *op1 = res_op1;
- *op0 = res_op0;
- *oq0 = res_oq0;
- *oq1 = res_oq1;
- *oq2 = res_oq2;
-}
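
The six output formulas quoted in the comments above are the 7-tap flat filter; mbfilter_dspr2() evaluates the same sums on two halfword-packed pixels per register, reusing add_p210_q012 plus the 0x00040004 rounding constant. A plain-C sketch with illustrative names:

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    static void mbfilter_sketch(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                                uint8_t *op2, uint8_t *op1, uint8_t *op0,
                                uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
      *op2 = (uint8_t)ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
      *op1 = (uint8_t)ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
      *op0 = (uint8_t)ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
      *oq0 = (uint8_t)ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
      *oq1 = (uint8_t)ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
      *oq2 = (uint8_t)ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
    }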
-
-static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
- uint32_t p0, uint32_t q0, uint32_t q1,
- uint32_t q2, uint32_t q3, uint32_t *op2_f1,
- uint32_t *op1_f1, uint32_t *op0_f1,
- uint32_t *oq0_f1, uint32_t *oq1_f1,
- uint32_t *oq2_f1) {
- /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
- uint32_t res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2;
- uint32_t tmp;
- uint32_t add_p210_q012;
- uint32_t u32Four = 0x00040004;
-
- /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
- /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
- /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
- /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
- /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
- /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
-
- __asm__ __volatile__(
- "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
-
- "shll.ph %[tmp], %[p3], 1 \n\t"
- "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
- "addu.ph %[res_op1], %[p3], %[p3] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
- "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
- "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
- "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
- "addu.ph %[res_op0], %[p3], %[p0] \n\t"
- "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
- "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
- "shll.ph %[tmp], %[q3], 1 \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
- "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
- "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
- "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
- "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
- "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
- "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
-
- : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
- [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
- [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
- [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
- [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
-
- *op2_f1 = res_op2;
- *op1_f1 = res_op1;
- *op0_f1 = res_op0;
- *oq0_f1 = res_oq0;
- *oq1_f1 = res_oq1;
- *oq2_f1 = res_oq2;
-}
-
-static INLINE void wide_mbfilter_dspr2(
- uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
- uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
- uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
- uint32_t *oq7) {
- const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
- const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
- const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
- uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
- uint32_t tmp;
- uint32_t add_p6toq6;
- uint32_t u32Eight = 0x00080008;
-
- __asm__ __volatile__(
- /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
- which is used most of the time */
- "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
-
- : [add_p6toq6] "=&r"(add_p6toq6)
- : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
- [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
- [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
- [u32Eight] "r"(u32Eight));
-
- __asm__ __volatile__(
- /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
- p3 + p2 + p1 + p0 + q0, 4) */
- "shll.ph %[tmp], %[p7], 3 \n\t"
- "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
- "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
- "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
-
- /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
- p2 + p1 + p0 + q0 + q1, 4) */
- "shll.ph %[tmp], %[p7], 2 \n\t"
- "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
- "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
- "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
- "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
-
- /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
- p1 + p0 + q0 + q1 + q2, 4) */
- "shll.ph %[tmp], %[p7], 2 \n\t"
- "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
- "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
- "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
-
- /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
- p1 + p0 + q0 + q1 + q2 + q3, 4) */
- "shll.ph %[tmp], %[p7], 2 \n\t"
- "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
- "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
- "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
- "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
- "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
- "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
-
- /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
- p0 + q0 + q1 + q2 + q3 + q4, 4) */
- "shll.ph %[tmp], %[p7], 1 \n\t"
- "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
- "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
-
- /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
- p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
- "shll.ph %[tmp], %[p7], 1 \n\t"
- "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
- "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
- "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
-
- /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
- "addu.ph %[res_op0], %[p7], %[p0] \n\t"
- "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
- "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
-
- : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
- [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
- [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
- [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
- : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
- [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
- [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
- [add_p6toq6] "r"(add_p6toq6));
-
- *op6 = res_op6;
- *op5 = res_op5;
- *op4 = res_op4;
- *op3 = res_op3;
- *op2 = res_op2;
- *op1 = res_op1;
- *op0 = res_op0;
-
- __asm__ __volatile__(
- /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
- q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
- "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
- "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
- "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"
-
- /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
- q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
- "shll.ph %[tmp], %[q7], 1 \n\t"
- "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
- "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"
-
- /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
- q3 + q4 + q5 + q6 + q7 * 3, 4) */
- "shll.ph %[tmp], %[q7], 1 \n\t"
- "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
- "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"
-
- /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
- q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
- "shll.ph %[tmp], %[q7], 2 \n\t"
- "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
- "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
- "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
- "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
- "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"
-
- /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
- q4 * 2 + q5 + q6 + q7 * 5, 4) */
- "shll.ph %[tmp], %[q7], 2 \n\t"
- "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
- "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
- "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"
-
- /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
- q5 * 2 + q6 + q7 * 6, 4) */
- "shll.ph %[tmp], %[q7], 2 \n\t"
- "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
- "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
- "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
- "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"
-
- /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
- q4 + q5 + q6 * 2 + q7 * 7, 4) */
- "shll.ph %[tmp], %[q7], 3 \n\t"
- "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
- "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
- "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
-
- : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
- [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
- [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
- [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
- : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
- [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
- [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
- [add_p6toq6] "r"(add_p6toq6));
-
- *oq0 = res_oq0;
- *oq1 = res_oq1;
- *oq2 = res_oq2;
- *oq3 = res_oq3;
- *oq4 = res_oq4;
- *oq5 = res_oq5;
- *oq6 = res_oq6;
-}
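
The 15-tap wide filter removed above follows one pattern throughout: a shared sum covering p6..q6 plus the rounding bias, with each output formed by a few pixel-specific additions and subtractions before the >> 4. A plain-C sketch of the first output tap (array layout and names are assumptions for illustration):

    #include <stdint.h>

    /* p[i] = p_i and q[i] = q_i for i = 0..7. */
    static uint8_t wide_op6_sketch(const uint8_t p[8], const uint8_t q[8]) {
      int add_p6toq6 = 8; /* rounding bias, 0x00080008 per halfword pair */
      for (int i = 0; i < 7; ++i) add_p6toq6 += p[i] + q[i];
      /* op6 = ROUND_POWER_OF_TWO(p7*7 + p6*2 + p5+p4+p3+p2+p1+p0 + q0, 4) */
      return (uint8_t)((p[7] * 7 + p[6] + add_p6toq6 - q[1] - q[2] - q[3] -
                        q[4] - q[5] - q[6]) >> 4);
    }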
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/aom_dsp/mips/loopfilter_macros_dspr2.h b/aom_dsp/mips/loopfilter_macros_dspr2.h
deleted file mode 100644
index 62295d69d..000000000
--- a/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-#define STORE_F0() \
- { \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s4]) \n\t" \
- "sb %[q0_f0], 0(%[s4]) \n\t" \
- "sb %[p0_f0], -1(%[s4]) \n\t" \
- "sb %[p1_f0], -2(%[s4]) \n\t" \
- \
- : \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
- [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \
- \
- __asm__ __volatile__( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
- [p1_f0] "+r"(p1_f0) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s3]) \n\t" \
- "sb %[q0_f0], 0(%[s3]) \n\t" \
- "sb %[p0_f0], -1(%[s3]) \n\t" \
- "sb %[p1_f0], -2(%[s3]) \n\t" \
- \
- : [p1_f0] "+r"(p1_f0) \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \
- [p0_f0] "r"(p0_f0)); \
- \
- __asm__ __volatile__( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
- [p1_f0] "+r"(p1_f0) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s2]) \n\t" \
- "sb %[q0_f0], 0(%[s2]) \n\t" \
- "sb %[p0_f0], -1(%[s2]) \n\t" \
- "sb %[p1_f0], -2(%[s2]) \n\t" \
- \
- : \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
- [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \
- \
- __asm__ __volatile__( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
- [p1_f0] "+r"(p1_f0) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s1]) \n\t" \
- "sb %[q0_f0], 0(%[s1]) \n\t" \
- "sb %[p0_f0], -1(%[s1]) \n\t" \
- "sb %[p1_f0], -2(%[s1]) \n\t" \
- \
- : \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
- [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \
- }
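
What STORE_F0() above does, expressed as plain C under the assumption that each packed word carries one filtered pixel per byte lane, with the lowest byte belonging to row s4 (helper name is illustrative):

    #include <stdint.h>

    /* rows[0..3] point at column 0 (the q0 byte) of rows s1..s4. */
    static void store_f0_sketch(uint8_t *rows[4], uint32_t p1, uint32_t p0,
                                uint32_t q0, uint32_t q1) {
      for (int r = 3; r >= 0; --r) { /* s4 first, then s3, s2, s1 */
        rows[r][-2] = (uint8_t)p1;
        rows[r][-1] = (uint8_t)p0;
        rows[r][0] = (uint8_t)q0;
        rows[r][1] = (uint8_t)q1;
        p1 >>= 8; p0 >>= 8; q0 >>= 8; q1 >>= 8;
      }
    }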
-
-#define STORE_F1() \
- { \
- __asm__ __volatile__( \
- "sb %[q2_r], 2(%[s4]) \n\t" \
- "sb %[q1_r], 1(%[s4]) \n\t" \
- "sb %[q0_r], 0(%[s4]) \n\t" \
- "sb %[p0_r], -1(%[s4]) \n\t" \
- "sb %[p1_r], -2(%[s4]) \n\t" \
- "sb %[p2_r], -3(%[s4]) \n\t" \
- \
- : \
- : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
- [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
- \
- __asm__ __volatile__( \
- "srl %[q2_r], %[q2_r], 16 \n\t" \
- "srl %[q1_r], %[q1_r], 16 \n\t" \
- "srl %[q0_r], %[q0_r], 16 \n\t" \
- "srl %[p0_r], %[p0_r], 16 \n\t" \
- "srl %[p1_r], %[p1_r], 16 \n\t" \
- "srl %[p2_r], %[p2_r], 16 \n\t" \
- \
- : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \
- [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q2_r], 2(%[s3]) \n\t" \
- "sb %[q1_r], 1(%[s3]) \n\t" \
- "sb %[q0_r], 0(%[s3]) \n\t" \
- "sb %[p0_r], -1(%[s3]) \n\t" \
- "sb %[p1_r], -2(%[s3]) \n\t" \
- "sb %[p2_r], -3(%[s3]) \n\t" \
- \
- : \
- : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
- [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
- \
- __asm__ __volatile__( \
- "sb %[q2_l], 2(%[s2]) \n\t" \
- "sb %[q1_l], 1(%[s2]) \n\t" \
- "sb %[q0_l], 0(%[s2]) \n\t" \
- "sb %[p0_l], -1(%[s2]) \n\t" \
- "sb %[p1_l], -2(%[s2]) \n\t" \
- "sb %[p2_l], -3(%[s2]) \n\t" \
- \
- : \
- : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
- [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
- \
- __asm__ __volatile__( \
- "srl %[q2_l], %[q2_l], 16 \n\t" \
- "srl %[q1_l], %[q1_l], 16 \n\t" \
- "srl %[q0_l], %[q0_l], 16 \n\t" \
- "srl %[p0_l], %[p0_l], 16 \n\t" \
- "srl %[p1_l], %[p1_l], 16 \n\t" \
- "srl %[p2_l], %[p2_l], 16 \n\t" \
- \
- : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \
- [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q2_l], 2(%[s1]) \n\t" \
- "sb %[q1_l], 1(%[s1]) \n\t" \
- "sb %[q0_l], 0(%[s1]) \n\t" \
- "sb %[p0_l], -1(%[s1]) \n\t" \
- "sb %[p1_l], -2(%[s1]) \n\t" \
- "sb %[p2_l], -3(%[s1]) \n\t" \
- \
- : \
- : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
- [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
- }
-
-#define STORE_F2() \
- { \
- __asm__ __volatile__( \
- "sb %[q6_r], 6(%[s4]) \n\t" \
- "sb %[q5_r], 5(%[s4]) \n\t" \
- "sb %[q4_r], 4(%[s4]) \n\t" \
- "sb %[q3_r], 3(%[s4]) \n\t" \
- "sb %[q2_r], 2(%[s4]) \n\t" \
- "sb %[q1_r], 1(%[s4]) \n\t" \
- "sb %[q0_r], 0(%[s4]) \n\t" \
- "sb %[p0_r], -1(%[s4]) \n\t" \
- "sb %[p1_r], -2(%[s4]) \n\t" \
- "sb %[p2_r], -3(%[s4]) \n\t" \
- "sb %[p3_r], -4(%[s4]) \n\t" \
- "sb %[p4_r], -5(%[s4]) \n\t" \
- "sb %[p5_r], -6(%[s4]) \n\t" \
- "sb %[p6_r], -7(%[s4]) \n\t" \
- \
- : \
- : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
- [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
- [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
- [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
- [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \
- \
- __asm__ __volatile__( \
- "srl %[q6_r], %[q6_r], 16 \n\t" \
- "srl %[q5_r], %[q5_r], 16 \n\t" \
- "srl %[q4_r], %[q4_r], 16 \n\t" \
- "srl %[q3_r], %[q3_r], 16 \n\t" \
- "srl %[q2_r], %[q2_r], 16 \n\t" \
- "srl %[q1_r], %[q1_r], 16 \n\t" \
- "srl %[q0_r], %[q0_r], 16 \n\t" \
- "srl %[p0_r], %[p0_r], 16 \n\t" \
- "srl %[p1_r], %[p1_r], 16 \n\t" \
- "srl %[p2_r], %[p2_r], 16 \n\t" \
- "srl %[p3_r], %[p3_r], 16 \n\t" \
- "srl %[p4_r], %[p4_r], 16 \n\t" \
- "srl %[p5_r], %[p5_r], 16 \n\t" \
- "srl %[p6_r], %[p6_r], 16 \n\t" \
- \
- : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
- [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
- [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
- [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
- [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q6_r], 6(%[s3]) \n\t" \
- "sb %[q5_r], 5(%[s3]) \n\t" \
- "sb %[q4_r], 4(%[s3]) \n\t" \
- "sb %[q3_r], 3(%[s3]) \n\t" \
- "sb %[q2_r], 2(%[s3]) \n\t" \
- "sb %[q1_r], 1(%[s3]) \n\t" \
- "sb %[q0_r], 0(%[s3]) \n\t" \
- "sb %[p0_r], -1(%[s3]) \n\t" \
- "sb %[p1_r], -2(%[s3]) \n\t" \
- "sb %[p2_r], -3(%[s3]) \n\t" \
- "sb %[p3_r], -4(%[s3]) \n\t" \
- "sb %[p4_r], -5(%[s3]) \n\t" \
- "sb %[p5_r], -6(%[s3]) \n\t" \
- "sb %[p6_r], -7(%[s3]) \n\t" \
- \
- : \
- : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
- [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
- [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
- [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
- [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \
- \
- __asm__ __volatile__( \
- "sb %[q6_l], 6(%[s2]) \n\t" \
- "sb %[q5_l], 5(%[s2]) \n\t" \
- "sb %[q4_l], 4(%[s2]) \n\t" \
- "sb %[q3_l], 3(%[s2]) \n\t" \
- "sb %[q2_l], 2(%[s2]) \n\t" \
- "sb %[q1_l], 1(%[s2]) \n\t" \
- "sb %[q0_l], 0(%[s2]) \n\t" \
- "sb %[p0_l], -1(%[s2]) \n\t" \
- "sb %[p1_l], -2(%[s2]) \n\t" \
- "sb %[p2_l], -3(%[s2]) \n\t" \
- "sb %[p3_l], -4(%[s2]) \n\t" \
- "sb %[p4_l], -5(%[s2]) \n\t" \
- "sb %[p5_l], -6(%[s2]) \n\t" \
- "sb %[p6_l], -7(%[s2]) \n\t" \
- \
- : \
- : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
- [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
- [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
- [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
- [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \
- \
- __asm__ __volatile__( \
- "srl %[q6_l], %[q6_l], 16 \n\t" \
- "srl %[q5_l], %[q5_l], 16 \n\t" \
- "srl %[q4_l], %[q4_l], 16 \n\t" \
- "srl %[q3_l], %[q3_l], 16 \n\t" \
- "srl %[q2_l], %[q2_l], 16 \n\t" \
- "srl %[q1_l], %[q1_l], 16 \n\t" \
- "srl %[q0_l], %[q0_l], 16 \n\t" \
- "srl %[p0_l], %[p0_l], 16 \n\t" \
- "srl %[p1_l], %[p1_l], 16 \n\t" \
- "srl %[p2_l], %[p2_l], 16 \n\t" \
- "srl %[p3_l], %[p3_l], 16 \n\t" \
- "srl %[p4_l], %[p4_l], 16 \n\t" \
- "srl %[p5_l], %[p5_l], 16 \n\t" \
- "srl %[p6_l], %[p6_l], 16 \n\t" \
- \
- : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
- [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
- [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
- [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
- [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q6_l], 6(%[s1]) \n\t" \
- "sb %[q5_l], 5(%[s1]) \n\t" \
- "sb %[q4_l], 4(%[s1]) \n\t" \
- "sb %[q3_l], 3(%[s1]) \n\t" \
- "sb %[q2_l], 2(%[s1]) \n\t" \
- "sb %[q1_l], 1(%[s1]) \n\t" \
- "sb %[q0_l], 0(%[s1]) \n\t" \
- "sb %[p0_l], -1(%[s1]) \n\t" \
- "sb %[p1_l], -2(%[s1]) \n\t" \
- "sb %[p2_l], -3(%[s1]) \n\t" \
- "sb %[p3_l], -4(%[s1]) \n\t" \
- "sb %[p4_l], -5(%[s1]) \n\t" \
- "sb %[p5_l], -6(%[s1]) \n\t" \
- "sb %[p6_l], -7(%[s1]) \n\t" \
- \
- : \
- : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
- [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
- [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
- [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
- [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \
- }
-
-#define PACK_LEFT_0TO3() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
- "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
- "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
- "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
- "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
- "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
- "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
- "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
- \
- : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
- [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
- [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \
- : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
- [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
- }
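
PACK_LEFT_0TO3() above (and its _4TO7 variant) rely on preceu.ph.qbl/qbr, which split a word of four packed 8-bit pixels into two words of zero-extended 16-bit lanes so the filters can use saturating halfword arithmetic. A plain-C sketch (function name assumed):

    #include <stdint.h>

    static void unpack_qb_to_ph(uint32_t px, uint32_t *left, uint32_t *right) {
      /* preceu.ph.qbl: the two high bytes become zero-extended halfwords. */
      *left = (((px >> 24) & 0xFF) << 16) | ((px >> 16) & 0xFF);
      /* preceu.ph.qbr: the two low bytes become zero-extended halfwords. */
      *right = (((px >> 8) & 0xFF) << 16) | (px & 0xFF);
    }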
-
-#define PACK_LEFT_4TO7() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
- "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
- "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
- "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
- "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
- "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
- "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
- "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
- \
- : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
- [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
- [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \
- : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
- [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
- }
-
-#define PACK_RIGHT_0TO3() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
- "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
- "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
- "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
- "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
- "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
- "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
- "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
- \
- : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
- [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
- [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \
- : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
- [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
- }
-
-#define PACK_RIGHT_4TO7() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
- "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
- "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
- "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
- "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
- "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
- "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
- "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
- \
- : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
- [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
- [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \
- : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
- [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
- }
-
-#define COMBINE_LEFT_RIGHT_0TO2() \
- { \
- __asm__ __volatile__( \
- "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
- "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
- "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
- "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
- "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
- "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
- \
- : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
- [q1] "=&r"(q1), [q2] "=&r"(q2) \
- : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \
- [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \
- [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \
- [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \
- }
-
-#define COMBINE_LEFT_RIGHT_3TO6() \
- { \
- __asm__ __volatile__( \
- "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
- "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
- "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
- "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
- "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
- "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
- "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
- "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
- \
- : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
- [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \
- [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \
- [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \
- [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \
- [q6_r] "r"(q6_r)); \
- }
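
COMBINE_LEFT_RIGHT_0TO2()/_3TO6() above are the inverse step: precr.qb.ph keeps the low byte of each 16-bit lane and repacks four pixels into one word, left pair in the high byte lanes and right pair in the low. Sketched in plain C (name assumed):

    #include <stdint.h>

    static uint32_t pack_ph_to_qb(uint32_t left, uint32_t right) {
      return ((left & 0x00FF0000u) << 8) | ((left & 0xFFu) << 16) |
             ((right & 0x00FF0000u) >> 8) | (right & 0xFFu);
    }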
-
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/aom_dsp/mips/loopfilter_masks_dspr2.h b/aom_dsp/mips/loopfilter_masks_dspr2.h
deleted file mode 100644
index a0f57f386..000000000
--- a/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* processing 4 pixels at the same time
- * compute hev and mask in the same function */
-static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
- uint32_t p1, uint32_t p0, uint32_t p3,
- uint32_t p2, uint32_t q0, uint32_t q1,
- uint32_t q2, uint32_t q3,
- uint32_t thresh, uint32_t *hev,
- uint32_t *mask) {
- uint32_t c, r, r3, r_k;
- uint32_t s1, s2, s3;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t hev1;
-
- __asm__ __volatile__(
- /* mask |= (abs(p3 - p2) > limit) */
- "subu_s.qb %[c], %[p3], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], $0, %[c] \n\t"
-
- /* mask |= (abs(p2 - p1) > limit) */
- "subu_s.qb %[c], %[p2], %[p1] \n\t"
- "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(p1 - p0) > limit)
- * hev |= (abs(p1 - p0) > thresh)
- */
- "subu_s.qb %[c], %[p1], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], $0, %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(q1 - q0) > limit)
- * hev |= (abs(q1 - q0) > thresh)
- */
- "subu_s.qb %[c], %[q1], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], %[r3], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(q2 - q1) > limit) */
- "subu_s.qb %[c], %[q2], %[q1] \n\t"
- "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r3], %[r3], 24 \n\t"
-
- /* mask |= (abs(q3 - q2) > limit) */
- "subu_s.qb %[c], %[q3], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
- : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
- [thresh] "r"(thresh));
-
- __asm__ __volatile__(
- /* abs(p0 - q0) */
- "subu_s.qb %[c], %[p0], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
- "wrdsp %[r3] \n\t"
- "or %[s1], %[r_k], %[c] \n\t"
-
- /* abs(p1 - q1) */
- "subu_s.qb %[c], %[p1], %[q1] \n\t"
- "addu_s.qb %[s3], %[s1], %[s1] \n\t"
- "pick.qb %[hev1], %[ones], $0 \n\t"
- "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
- "or %[s2], %[r_k], %[c] \n\t"
-
-      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit */
- "shrl.qb %[s2], %[s2], 1 \n\t"
- "addu_s.qb %[s1], %[s2], %[s3] \n\t"
- "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r], %[r], 24 \n\t"
-
- "wrdsp %[r] \n\t"
- "pick.qb %[s2], $0, %[ones] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
- [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
- [ones] "r"(ones), [flimit] "r"(flimit));
-
- *hev = hev1;
- *mask = s2;
-}
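
filter_hev_mask_dspr2() above evaluates the usual loop-filter gating tests once per byte lane: cmpgu.lt.qb collects one condition bit per pixel and pick.qb expands those bits to 0x00/0xFF bytes. A plain-C sketch of the single-pixel test (names assumed; flimit is the replicated blimit):

    #include <stdint.h>
    #include <stdlib.h>

    static void hev_mask_sketch(uint8_t limit, uint8_t flimit, uint8_t thresh,
                                uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                                uint8_t *hev, uint8_t *mask) {
      int over = 0, high_var = 0;
      over |= abs(p3 - p2) > limit;
      over |= abs(p2 - p1) > limit;
      over |= abs(p1 - p0) > limit;
      over |= abs(q1 - q0) > limit;
      over |= abs(q2 - q1) > limit;
      over |= abs(q3 - q2) > limit;
      over |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit;
      high_var |= abs(p1 - p0) > thresh;
      high_var |= abs(q1 - q0) > thresh;
      *mask = over ? 0x00 : 0xFF;    /* 0xFF = filtering allowed */
      *hev = high_var ? 0xFF : 0x00; /* 0xFF = high edge variance */
    }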
-
-static INLINE void filter_hev_mask_flatmask4_dspr2(
- uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
- uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
- uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
- uint32_t c, r, r3, r_k, r_flat;
- uint32_t s1, s2, s3;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t flat_thresh = 0x01010101;
- uint32_t hev1;
- uint32_t flat1;
-
- __asm__ __volatile__(
- /* mask |= (abs(p3 - p2) > limit) */
- "subu_s.qb %[c], %[p3], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], $0, %[c] \n\t"
-
- /* mask |= (abs(p2 - p1) > limit) */
- "subu_s.qb %[c], %[p2], %[p1] \n\t"
- "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(p1 - p0) > limit)
- * hev |= (abs(p1 - p0) > thresh)
- * flat |= (abs(p1 - p0) > thresh)
- */
- "subu_s.qb %[c], %[p1], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], $0, %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], $0, %[c] \n\t"
-
- /* mask |= (abs(q1 - q0) > limit)
- * hev |= (abs(q1 - q0) > thresh)
- * flat |= (abs(q1 - q0) > thresh)
- */
- "subu_s.qb %[c], %[q1], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], %[r3], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p0 - p2) > thresh) */
- "subu_s.qb %[c], %[p0], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q0 - q2) > thresh) */
- "subu_s.qb %[c], %[q0], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p3 - p0) > thresh) */
- "subu_s.qb %[c], %[p3], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q3 - q0) > thresh) */
- "subu_s.qb %[c], %[q3], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
- "sll %[r_flat], %[r_flat], 24 \n\t"
- /* look at stall here */
- "wrdsp %[r_flat] \n\t"
- "pick.qb %[flat1], $0, %[ones] \n\t"
-
- /* mask |= (abs(q2 - q1) > limit) */
- "subu_s.qb %[c], %[q2], %[q1] \n\t"
- "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r3], %[r3], 24 \n\t"
-
- /* mask |= (abs(q3 - q2) > limit) */
- "subu_s.qb %[c], %[q3], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
- [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
- : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
- [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
-
- __asm__ __volatile__(
- /* abs(p0 - q0) */
- "subu_s.qb %[c], %[p0], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
- "wrdsp %[r3] \n\t"
- "or %[s1], %[r_k], %[c] \n\t"
-
- /* abs(p1 - q1) */
- "subu_s.qb %[c], %[p1], %[q1] \n\t"
- "addu_s.qb %[s3], %[s1], %[s1] \n\t"
- "pick.qb %[hev1], %[ones], $0 \n\t"
- "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
- "or %[s2], %[r_k], %[c] \n\t"
-
-      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit */
- "shrl.qb %[s2], %[s2], 1 \n\t"
- "addu_s.qb %[s1], %[s2], %[s3] \n\t"
- "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r], %[r], 24 \n\t"
-
- "wrdsp %[r] \n\t"
- "pick.qb %[s2], $0, %[ones] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
- [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
- [ones] "r"(ones), [flimit] "r"(flimit));
-
- *hev = hev1;
- *mask = s2;
- *flat = flat1;
-}
-
-static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
- uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
- uint32_t q3, uint32_t q4, uint32_t *flat2) {
- uint32_t c, r, r_k, r_flat;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t flat_thresh = 0x01010101;
- uint32_t flat1, flat3;
-
- __asm__ __volatile__(
- /* flat |= (abs(p4 - p0) > thresh) */
- "subu_s.qb %[c], %[p4], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r], $0, %[c] \n\t"
-
- /* flat |= (abs(q4 - q0) > thresh) */
- "subu_s.qb %[c], %[q4], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q4] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r], %[r], 24 \n\t"
- "wrdsp %[r] \n\t"
- "pick.qb %[flat3], $0, %[ones] \n\t"
-
- /* flat |= (abs(p1 - p0) > thresh) */
- "subu_s.qb %[c], %[p1], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], $0, %[c] \n\t"
-
- /* flat |= (abs(q1 - q0) > thresh) */
- "subu_s.qb %[c], %[q1], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p0 - p2) > thresh) */
- "subu_s.qb %[c], %[p0], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q0 - q2) > thresh) */
- "subu_s.qb %[c], %[q0], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p3 - p0) > thresh) */
- "subu_s.qb %[c], %[p3], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q3 - q0) > thresh) */
- "subu_s.qb %[c], %[q3], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
- "sll %[r_flat], %[r_flat], 24 \n\t"
- "wrdsp %[r_flat] \n\t"
- "pick.qb %[flat1], $0, %[ones] \n\t"
- /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
- "and %[flat1], %[flat3], %[flat1] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
- [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
- [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
- [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
-
- *flat2 = flat1;
-}
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/aom_dsp/mips/loopfilter_mb_dspr2.c b/aom_dsp/mips/loopfilter_mb_dspr2.c
deleted file mode 100644
index b67ccfe9d..000000000
--- a/aom_dsp/mips/loopfilter_mb_dspr2.c
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint32_t mask;
- uint32_t hev, flat;
- uint8_t i;
- uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
- uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
- uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s);
-
- for (i = 0; i < 2; i++) {
- sp3 = s - (pitch << 2);
- sp2 = sp3 + pitch;
- sp1 = sp2 + pitch;
- sp0 = sp1 + pitch;
- sq0 = s;
- sq1 = s + pitch;
- sq2 = sq1 + pitch;
- sq3 = sq2 + pitch;
-
- __asm__ __volatile__(
- "lw %[p3], (%[sp3]) \n\t"
- "lw %[p2], (%[sp2]) \n\t"
- "lw %[p1], (%[sp1]) \n\t"
- "lw %[p0], (%[sp0]) \n\t"
- "lw %[q0], (%[sq0]) \n\t"
- "lw %[q1], (%[sq1]) \n\t"
- "lw %[q2], (%[sq2]) \n\t"
- "lw %[q3], (%[sq3]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
- : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- if ((flat == 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- __asm__ __volatile__(
- "sw %[p1_f0], (%[sp1]) \n\t"
- "sw %[p0_f0], (%[sp0]) \n\t"
- "sw %[q0_f0], (%[sq0]) \n\t"
- "sw %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1));
- } else if ((mask & flat) == 0xFFFFFFFF) {
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- COMBINE_LEFT_RIGHT_0TO2()
-
- __asm__ __volatile__(
- "sw %[p2], (%[sp2]) \n\t"
- "sw %[p1], (%[sp1]) \n\t"
- "sw %[p0], (%[sp0]) \n\t"
- "sw %[q0], (%[sq0]) \n\t"
- "sw %[q1], (%[sq1]) \n\t"
- "sw %[q2], (%[sq2]) \n\t"
-
- :
- : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
- [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
- [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if ((flat != 0) && (mask != 0)) {
- /* filtering */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], (%[sp2]) \n\t"
- "sb %[p1_r], (%[sp1]) \n\t"
- "sb %[p0_r], (%[sp0]) \n\t"
- "sb %[q0_r], (%[sq0]) \n\t"
- "sb %[q1_r], (%[sq1]) \n\t"
- "sb %[q2_r], (%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], (%[sp1]) \n\t"
- "sb %[p0_f0], (%[sp0]) \n\t"
- "sb %[q0_f0], (%[sq0]) \n\t"
- "sb %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], +1(%[sp2]) \n\t"
- "sb %[p1_r], +1(%[sp1]) \n\t"
- "sb %[p0_r], +1(%[sp0]) \n\t"
- "sb %[q0_r], +1(%[sq0]) \n\t"
- "sb %[q1_r], +1(%[sq1]) \n\t"
- "sb %[q2_r], +1(%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], +1(%[sp1]) \n\t"
- "sb %[p0_f0], +1(%[sp0]) \n\t"
- "sb %[q0_f0], +1(%[sq0]) \n\t"
- "sb %[q1_f0], +1(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
- [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
- [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], +2(%[sp2]) \n\t"
- "sb %[p1_l], +2(%[sp1]) \n\t"
- "sb %[p0_l], +2(%[sp0]) \n\t"
- "sb %[q0_l], +2(%[sq0]) \n\t"
- "sb %[q1_l], +2(%[sq1]) \n\t"
- "sb %[q2_l], +2(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +2(%[sp1]) \n\t"
- "sb %[p0_f0], +2(%[sp0]) \n\t"
- "sb %[q0_f0], +2(%[sq0]) \n\t"
- "sb %[q1_f0], +2(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], +3(%[sp2]) \n\t"
- "sb %[p1_l], +3(%[sp1]) \n\t"
- "sb %[p0_l], +3(%[sp0]) \n\t"
- "sb %[q0_l], +3(%[sq0]) \n\t"
- "sb %[q1_l], +3(%[sq1]) \n\t"
- "sb %[q2_l], +3(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +3(%[sp1]) \n\t"
- "sb %[p0_f0], +3(%[sp0]) \n\t"
- "sb %[q0_f0], +3(%[sq0]) \n\t"
- "sb %[q1_f0], +3(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
- }
-
- s = s + 4;
- }
-}
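
After mask/hev/flat are known, aom_lpf_horizontal_8_dspr2() above picks, per column, between the 7-tap flat output, the 4-tap output, or no change by testing the corresponding byte lane of mask and flat. A small plain-C sketch of that decision (function name assumed):

    #include <stdint.h>

    /* Returns 8 (use the mbfilter output, p2..q2), 4 (use the 4-tap output,
       p1..q1) or 0 (leave the column untouched) for column i = 0..3. */
    static int column_filter_choice(uint32_t mask, uint32_t flat, int i) {
      const uint32_t lane = 0xFFu << (8 * i);
      if (mask & flat & lane) return 8;
      if (mask & lane) return 4;
      return 0;
    }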
-
-void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev, flat;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
- uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- prefetch_store(s + pitch);
-
- for (i = 0; i < 2; i++) {
- s1 = s;
- s2 = s + pitch;
- s3 = s2 + pitch;
- s4 = s3 + pitch;
- s = s4 + pitch;
-
- __asm__ __volatile__(
- "lw %[p0], -4(%[s1]) \n\t"
- "lw %[p1], -4(%[s2]) \n\t"
- "lw %[p2], -4(%[s3]) \n\t"
- "lw %[p3], -4(%[s4]) \n\t"
- "lw %[q3], (%[s1]) \n\t"
- "lw %[q2], (%[s2]) \n\t"
- "lw %[q1], (%[s3]) \n\t"
- "lw %[q0], (%[s4]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- /* transpose p3, p2, p1, p0
- original (when loaded from memory)
- register -4 -3 -2 -1
- p0 p0_0 p0_1 p0_2 p0_3
- p1 p1_0 p1_1 p1_2 p1_3
- p2 p2_0 p2_1 p2_2 p2_3
- p3 p3_0 p3_1 p3_2 p3_3
-
- after transpose
- register
- p0 p3_3 p2_3 p1_3 p0_3
- p1 p3_2 p2_2 p1_2 p0_2
- p2 p3_1 p2_1 p1_1 p0_1
- p3 p3_0 p2_0 p1_0 p0_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
- "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
- "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
- "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
-
- "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
- "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
- "append %[p1], %[sec3], 16 \n\t"
- "append %[p3], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
- [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose q0, q1, q2, q3
- original (when loaded from memory)
- register +1 +2 +3 +4
- q3 q3_0 q3_1 q3_2 q3_3
- q2 q2_0 q2_1 q2_2 q2_3
- q1 q1_0 q1_1 q1_2 q1_3
- q0 q0_0 q0_1 q0_2 q0_3
-
- after transpose
- register
- q3 q0_3 q1_3 q2_3 q3_3
- q2 q0_2 q1_2 q2_2 q3_2
- q1 q0_1 q1_1 q2_1 q3_1
- q0 q0_0 q1_0 q2_0 q3_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
- "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
- "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
- "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
-
- "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
- "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
- "append %[q2], %[sec3], 16 \n\t"
- "append %[q0], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
- [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- if ((flat == 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
- STORE_F0()
- } else if ((mask & flat) == 0xFFFFFFFF) {
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- STORE_F1()
- } else if ((flat != 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s4]) \n\t"
- "sb %[p1_r], -2(%[s4]) \n\t"
- "sb %[p0_r], -1(%[s4]) \n\t"
- "sb %[q0_r], (%[s4]) \n\t"
- "sb %[q1_r], +1(%[s4]) \n\t"
- "sb %[q2_r], +2(%[s4]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s4] "r"(s4));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s4]) \n\t"
- "sb %[p0_f0], -1(%[s4]) \n\t"
- "sb %[q0_f0], (%[s4]) \n\t"
- "sb %[q1_f0], +1(%[s4]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s4] "r"(s4));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s3]) \n\t"
- "sb %[p1_r], -2(%[s3]) \n\t"
- "sb %[p0_r], -1(%[s3]) \n\t"
- "sb %[q0_r], (%[s3]) \n\t"
- "sb %[q1_r], +1(%[s3]) \n\t"
- "sb %[q2_r], +2(%[s3]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s3] "r"(s3));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s3]) \n\t"
- "sb %[p0_f0], -1(%[s3]) \n\t"
- "sb %[q0_f0], (%[s3]) \n\t"
- "sb %[q1_f0], +1(%[s3]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s3] "r"(s3));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
- [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
- [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s2] "r"(s2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s2]) \n\t"
- "sb %[p0_f0], -1(%[s2]) \n\t"
- "sb %[q0_f0], (%[s2]) \n\t"
- "sb %[q1_f0], +1(%[s2]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s2] "r"(s2));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s1]) \n\t"
- "sb %[p1_l], -2(%[s1]) \n\t"
- "sb %[p0_l], -1(%[s1]) \n\t"
- "sb %[q0_l], (%[s1]) \n\t"
- "sb %[q1_l], +1(%[s1]) \n\t"
- "sb %[q2_l], +2(%[s1]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s1] "r"(s1));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s1]) \n\t"
- "sb %[p0_f0], -1(%[s1]) \n\t"
- "sb %[q0_f0], (%[s1]) \n\t"
- "sb %[q1_f0], +1(%[s1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s1] "r"(s1));
- }
- }
- }
-}
-#endif // #if HAVE_DSPR2
diff --git a/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
deleted file mode 100644
index 34733e42e..000000000
--- a/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ /dev/null
@@ -1,734 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh, int count) {
- uint32_t mask;
- uint32_t hev, flat, flat2;
- uint8_t i;
- uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
- uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
- uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
- uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
- uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
- uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
- uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s);
-
- for (i = 0; i < (2 * count); i++) {
- sp7 = s - (pitch << 3);
- sp6 = sp7 + pitch;
- sp5 = sp6 + pitch;
- sp4 = sp5 + pitch;
- sp3 = sp4 + pitch;
- sp2 = sp3 + pitch;
- sp1 = sp2 + pitch;
- sp0 = sp1 + pitch;
- sq0 = s;
- sq1 = s + pitch;
- sq2 = sq1 + pitch;
- sq3 = sq2 + pitch;
- sq4 = sq3 + pitch;
- sq5 = sq4 + pitch;
- sq6 = sq5 + pitch;
- sq7 = sq6 + pitch;
-
- __asm__ __volatile__(
- "lw %[p7], (%[sp7]) \n\t"
- "lw %[p6], (%[sp6]) \n\t"
- "lw %[p5], (%[sp5]) \n\t"
- "lw %[p4], (%[sp4]) \n\t"
- "lw %[p3], (%[sp3]) \n\t"
- "lw %[p2], (%[sp2]) \n\t"
- "lw %[p1], (%[sp1]) \n\t"
- "lw %[p0], (%[sp0]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
- : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
-
- __asm__ __volatile__(
- "lw %[q0], (%[sq0]) \n\t"
- "lw %[q1], (%[sq1]) \n\t"
- "lw %[q2], (%[sq2]) \n\t"
- "lw %[q3], (%[sq3]) \n\t"
- "lw %[q4], (%[sq4]) \n\t"
- "lw %[q5], (%[sq5]) \n\t"
- "lw %[q6], (%[sq6]) \n\t"
- "lw %[q7], (%[sq7]) \n\t"
-
- : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
- [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
- : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
- [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
-
- /* f0 */
- if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
- ((flat2 != 0) && (flat == 0) && (mask != 0))) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- __asm__ __volatile__(
- "sw %[p1_f0], (%[sp1]) \n\t"
- "sw %[p0_f0], (%[sp0]) \n\t"
- "sw %[q0_f0], (%[sq0]) \n\t"
- "sw %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1));
-    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
- (mask == 0xFFFFFFFF)) {
- /* f2 */
- PACK_LEFT_0TO3()
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_0TO3()
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- COMBINE_LEFT_RIGHT_0TO2()
- COMBINE_LEFT_RIGHT_3TO6()
-
- __asm__ __volatile__(
- "sw %[p6], (%[sp6]) \n\t"
- "sw %[p5], (%[sp5]) \n\t"
- "sw %[p4], (%[sp4]) \n\t"
- "sw %[p3], (%[sp3]) \n\t"
- "sw %[p2], (%[sp2]) \n\t"
- "sw %[p1], (%[sp1]) \n\t"
- "sw %[p0], (%[sp0]) \n\t"
-
- :
- : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
- [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
- [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sw %[q6], (%[sq6]) \n\t"
- "sw %[q5], (%[sq5]) \n\t"
- "sw %[q4], (%[sq4]) \n\t"
- "sw %[q3], (%[sq3]) \n\t"
- "sw %[q2], (%[sq2]) \n\t"
- "sw %[q1], (%[sq1]) \n\t"
- "sw %[q0], (%[sq0]) \n\t"
-
- :
- : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
- [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
- [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
- [sq1] "r"(sq1), [sq0] "r"(sq0));
- } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
- /* f1 */
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- COMBINE_LEFT_RIGHT_0TO2()
-
- __asm__ __volatile__(
- "sw %[p2], (%[sp2]) \n\t"
- "sw %[p1], (%[sp1]) \n\t"
- "sw %[p0], (%[sp0]) \n\t"
- "sw %[q0], (%[sq0]) \n\t"
- "sw %[q1], (%[sq1]) \n\t"
- "sw %[q2], (%[sq2]) \n\t"
-
- :
- : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
- [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
- [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
- /* f0+f1 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], (%[sp2]) \n\t"
- "sb %[p1_r], (%[sp1]) \n\t"
- "sb %[p0_r], (%[sp0]) \n\t"
- "sb %[q0_r], (%[sq0]) \n\t"
- "sb %[q1_r], (%[sq1]) \n\t"
- "sb %[q2_r], (%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], (%[sp1]) \n\t"
- "sb %[p0_f0], (%[sp0]) \n\t"
- "sb %[q0_f0], (%[sq0]) \n\t"
- "sb %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], +1(%[sp2]) \n\t"
- "sb %[p1_r], +1(%[sp1]) \n\t"
- "sb %[p0_r], +1(%[sp0]) \n\t"
- "sb %[q0_r], +1(%[sq0]) \n\t"
- "sb %[q1_r], +1(%[sq1]) \n\t"
- "sb %[q2_r], +1(%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], +1(%[sp1]) \n\t"
- "sb %[p0_f0], +1(%[sp0]) \n\t"
- "sb %[q0_f0], +1(%[sq0]) \n\t"
- "sb %[q1_f0], +1(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], +2(%[sp2]) \n\t"
- "sb %[p1_l], +2(%[sp1]) \n\t"
- "sb %[p0_l], +2(%[sp0]) \n\t"
- "sb %[q0_l], +2(%[sq0]) \n\t"
- "sb %[q1_l], +2(%[sq1]) \n\t"
- "sb %[q2_l], +2(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +2(%[sp1]) \n\t"
- "sb %[p0_f0], +2(%[sp0]) \n\t"
- "sb %[q0_f0], +2(%[sq0]) \n\t"
- "sb %[q1_f0], +2(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], +3(%[sp2]) \n\t"
- "sb %[p1_l], +3(%[sp1]) \n\t"
- "sb %[p0_l], +3(%[sp0]) \n\t"
- "sb %[q0_l], +3(%[sq0]) \n\t"
- "sb %[q1_l], +3(%[sq1]) \n\t"
- "sb %[q2_l], +3(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +3(%[sp1]) \n\t"
- "sb %[p0_f0], +3(%[sp0]) \n\t"
- "sb %[q0_f0], +3(%[sq0]) \n\t"
- "sb %[q1_f0], +3(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
- } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
- /* f0 + f1 + f2 */
- /* f0 function */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* f1 function */
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
- &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
- &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
-
- /* f2 function */
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- if (mask & flat & flat2 & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p6_r], (%[sp6]) \n\t"
- "sb %[p5_r], (%[sp5]) \n\t"
- "sb %[p4_r], (%[sp4]) \n\t"
- "sb %[p3_r], (%[sp3]) \n\t"
- "sb %[p2_r], (%[sp2]) \n\t"
- "sb %[p1_r], (%[sp1]) \n\t"
- "sb %[p0_r], (%[sp0]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_r], (%[sq0]) \n\t"
- "sb %[q1_r], (%[sq1]) \n\t"
- "sb %[q2_r], (%[sq2]) \n\t"
- "sb %[q3_r], (%[sq3]) \n\t"
- "sb %[q4_r], (%[sq4]) \n\t"
- "sb %[q5_r], (%[sq5]) \n\t"
- "sb %[q6_r], (%[sq6]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
- [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
- } else if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], (%[sp2]) \n\t"
- "sb %[p1_r_f1], (%[sp1]) \n\t"
- "sb %[p0_r_f1], (%[sp0]) \n\t"
- "sb %[q0_r_f1], (%[sq0]) \n\t"
- "sb %[q1_r_f1], (%[sq1]) \n\t"
- "sb %[q2_r_f1], (%[sq2]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], (%[sp1]) \n\t"
- "sb %[p0_f0], (%[sp0]) \n\t"
- "sb %[q0_f0], (%[sq0]) \n\t"
- "sb %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p6_r], %[p6_r], 16 \n\t"
- "srl %[p5_r], %[p5_r], 16 \n\t"
- "srl %[p4_r], %[p4_r], 16 \n\t"
- "srl %[p3_r], %[p3_r], 16 \n\t"
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[q3_r], %[q3_r], 16 \n\t"
- "srl %[q4_r], %[q4_r], 16 \n\t"
- "srl %[q5_r], %[q5_r], 16 \n\t"
- "srl %[q6_r], %[q6_r], 16 \n\t"
-
- : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
- [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
- [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
- [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
- "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
- "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
- "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
- "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
- "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
- [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
- [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p6_r], +1(%[sp6]) \n\t"
- "sb %[p5_r], +1(%[sp5]) \n\t"
- "sb %[p4_r], +1(%[sp4]) \n\t"
- "sb %[p3_r], +1(%[sp3]) \n\t"
- "sb %[p2_r], +1(%[sp2]) \n\t"
- "sb %[p1_r], +1(%[sp1]) \n\t"
- "sb %[p0_r], +1(%[sp0]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
- [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_r], +1(%[sq0]) \n\t"
- "sb %[q1_r], +1(%[sq1]) \n\t"
- "sb %[q2_r], +1(%[sq2]) \n\t"
- "sb %[q3_r], +1(%[sq3]) \n\t"
- "sb %[q4_r], +1(%[sq4]) \n\t"
- "sb %[q5_r], +1(%[sq5]) \n\t"
- "sb %[q6_r], +1(%[sq6]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
- [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
- } else if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], +1(%[sp2]) \n\t"
- "sb %[p1_r_f1], +1(%[sp1]) \n\t"
- "sb %[p0_r_f1], +1(%[sp0]) \n\t"
- "sb %[q0_r_f1], +1(%[sq0]) \n\t"
- "sb %[q1_r_f1], +1(%[sq1]) \n\t"
- "sb %[q2_r_f1], +1(%[sq2]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], +1(%[sp1]) \n\t"
- "sb %[p0_f0], +1(%[sp0]) \n\t"
- "sb %[q0_f0], +1(%[sq0]) \n\t"
- "sb %[q1_f0], +1(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p6_l], +2(%[sp6]) \n\t"
- "sb %[p5_l], +2(%[sp5]) \n\t"
- "sb %[p4_l], +2(%[sp4]) \n\t"
- "sb %[p3_l], +2(%[sp3]) \n\t"
- "sb %[p2_l], +2(%[sp2]) \n\t"
- "sb %[p1_l], +2(%[sp1]) \n\t"
- "sb %[p0_l], +2(%[sp0]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
- [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_l], +2(%[sq0]) \n\t"
- "sb %[q1_l], +2(%[sq1]) \n\t"
- "sb %[q2_l], +2(%[sq2]) \n\t"
- "sb %[q3_l], +2(%[sq3]) \n\t"
- "sb %[q4_l], +2(%[sq4]) \n\t"
- "sb %[q5_l], +2(%[sq5]) \n\t"
- "sb %[q6_l], +2(%[sq6]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
- [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
- } else if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], +2(%[sp2]) \n\t"
- "sb %[p1_l_f1], +2(%[sp1]) \n\t"
- "sb %[p0_l_f1], +2(%[sp0]) \n\t"
- "sb %[q0_l_f1], +2(%[sq0]) \n\t"
- "sb %[q1_l_f1], +2(%[sq1]) \n\t"
- "sb %[q2_l_f1], +2(%[sq2]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +2(%[sp1]) \n\t"
- "sb %[p0_f0], +2(%[sp0]) \n\t"
- "sb %[q0_f0], +2(%[sq0]) \n\t"
- "sb %[q1_f0], +2(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p6_l], %[p6_l], 16 \n\t"
- "srl %[p5_l], %[p5_l], 16 \n\t"
- "srl %[p4_l], %[p4_l], 16 \n\t"
- "srl %[p3_l], %[p3_l], 16 \n\t"
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[q3_l], %[q3_l], 16 \n\t"
- "srl %[q4_l], %[q4_l], 16 \n\t"
- "srl %[q5_l], %[q5_l], 16 \n\t"
- "srl %[q6_l], %[q6_l], 16 \n\t"
-
- : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
- [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
- [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
- [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
- "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
- "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
- "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
- "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
- "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
- [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
- [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p6_l], +3(%[sp6]) \n\t"
- "sb %[p5_l], +3(%[sp5]) \n\t"
- "sb %[p4_l], +3(%[sp4]) \n\t"
- "sb %[p3_l], +3(%[sp3]) \n\t"
- "sb %[p2_l], +3(%[sp2]) \n\t"
- "sb %[p1_l], +3(%[sp1]) \n\t"
- "sb %[p0_l], +3(%[sp0]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
- [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_l], +3(%[sq0]) \n\t"
- "sb %[q1_l], +3(%[sq1]) \n\t"
- "sb %[q2_l], +3(%[sq2]) \n\t"
- "sb %[q3_l], +3(%[sq3]) \n\t"
- "sb %[q4_l], +3(%[sq4]) \n\t"
- "sb %[q5_l], +3(%[sq5]) \n\t"
- "sb %[q6_l], +3(%[sq6]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
- [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
- } else if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], +3(%[sp2]) \n\t"
- "sb %[p1_l_f1], +3(%[sp1]) \n\t"
- "sb %[p0_l_f1], +3(%[sp0]) \n\t"
- "sb %[q0_l_f1], +3(%[sq0]) \n\t"
- "sb %[q1_l_f1], +3(%[sq1]) \n\t"
- "sb %[q2_l_f1], +3(%[sq2]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +3(%[sp1]) \n\t"
- "sb %[p0_f0], +3(%[sp0]) \n\t"
- "sb %[q0_f0], +3(%[sq0]) \n\t"
- "sb %[q1_f0], +3(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
- }
-
- s = s + 4;
- }
-}
-
-void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
-}
-
-void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
-}
-#endif // #if HAVE_DSPR2
diff --git a/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
deleted file mode 100644
index 3d3f1ec97..000000000
--- a/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ /dev/null
@@ -1,758 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev, flat, flat2;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
- uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
- uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
- uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
- uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
- uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- prefetch_store(s + pitch);
-
- for (i = 0; i < 2; i++) {
- s1 = s;
- s2 = s + pitch;
- s3 = s2 + pitch;
- s4 = s3 + pitch;
- s = s4 + pitch;
-
- __asm__ __volatile__(
- "lw %[p0], -4(%[s1]) \n\t"
- "lw %[p1], -4(%[s2]) \n\t"
- "lw %[p2], -4(%[s3]) \n\t"
- "lw %[p3], -4(%[s4]) \n\t"
- "lw %[p4], -8(%[s1]) \n\t"
- "lw %[p5], -8(%[s2]) \n\t"
- "lw %[p6], -8(%[s3]) \n\t"
- "lw %[p7], -8(%[s4]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- __asm__ __volatile__(
- "lw %[q3], (%[s1]) \n\t"
- "lw %[q2], (%[s2]) \n\t"
- "lw %[q1], (%[s3]) \n\t"
- "lw %[q0], (%[s4]) \n\t"
- "lw %[q7], +4(%[s1]) \n\t"
- "lw %[q6], +4(%[s2]) \n\t"
- "lw %[q5], +4(%[s3]) \n\t"
- "lw %[q4], +4(%[s4]) \n\t"
-
- : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
- [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- /* transpose p3, p2, p1, p0
- original (when loaded from memory)
- register -4 -3 -2 -1
- p0 p0_0 p0_1 p0_2 p0_3
- p1 p1_0 p1_1 p1_2 p1_3
- p2 p2_0 p2_1 p2_2 p2_3
- p3 p3_0 p3_1 p3_2 p3_3
-
- after transpose
- register
- p0 p3_3 p2_3 p1_3 p0_3
- p1 p3_2 p2_2 p1_2 p0_2
- p2 p3_1 p2_1 p1_1 p0_1
- p3 p3_0 p2_0 p1_0 p0_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
- "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
- "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
- "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
-
- "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
- "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
- "append %[p1], %[sec3], 16 \n\t"
- "append %[p3], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
- [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose q0, q1, q2, q3
- original (when loaded from memory)
- register +1 +2 +3 +4
- q3 q3_0 q3_1 q3_2 q3_3
- q2 q2_0 q2_1 q2_2 q2_3
- q1 q1_0 q1_1 q1_2 q1_3
- q0 q0_0 q0_1 q0_2 q0_3
-
- after transpose
- register
- q3 q0_3 q1_3 q2_3 q3_3
- q2 q0_2 q1_2 q2_2 q3_2
- q1 q0_1 q1_1 q2_1 q3_1
- q0 q0_0 q1_0 q2_0 q3_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
- "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
- "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
- "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
-
- "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
- "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
- "append %[q2], %[sec3], 16 \n\t"
- "append %[q0], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
- [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose p7, p6, p5, p4
- original (when loaded from memory)
- register -8 -7 -6 -5
- p4 p4_0 p4_1 p4_2 p4_3
- p5 p5_0 p5_1 p5_2 p5_3
- p6 p6_0 p6_1 p6_2 p6_3
- p7 p7_0 p7_1 p7_2 p7_3
-
- after transpose
- register
- p4 p7_3 p6_3 p5_3 p4_3
- p5 p7_2 p6_2 p5_2 p4_2
- p6 p7_1 p6_1 p5_1 p4_1
- p7 p7_0 p6_0 p5_0 p4_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
- "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
- "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
- "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
-
- "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
- "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
- "append %[p5], %[sec3], 16 \n\t"
- "append %[p7], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
- [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose q4, q5, q6, q7
- original (when loaded from memory)
- register +5 +6 +7 +8
- q7 q7_0 q7_1 q7_2 q7_3
- q6 q6_0 q6_1 q6_2 q6_3
- q5 q5_0 q5_1 q5_2 q5_3
- q4 q4_0 q4_1 q4_2 q4_3
-
- after transpose
- register
-       q7         q4_3  q5_3  q6_3  q7_3
-       q6         q4_2  q5_2  q6_2  q7_2
-       q5         q4_1  q5_1  q6_1  q7_1
-       q4         q4_0  q5_0  q6_0  q7_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
- "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
- "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
- "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
-
- "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
- "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
- "append %[q6], %[sec3], 16 \n\t"
- "append %[q4], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
- [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
-
- /* f0 */
- if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
- ((flat2 != 0) && (flat == 0) && (mask != 0))) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
- STORE_F0()
-    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
- (mask == 0xFFFFFFFF)) {
- /* f2 */
- PACK_LEFT_0TO3()
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_0TO3()
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- STORE_F2()
- } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
- /* f1 */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- STORE_F1()
- } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
- /* f0 + f1 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s4]) \n\t"
- "sb %[p1_r], -2(%[s4]) \n\t"
- "sb %[p0_r], -1(%[s4]) \n\t"
- "sb %[q0_r], (%[s4]) \n\t"
- "sb %[q1_r], +1(%[s4]) \n\t"
- "sb %[q2_r], +2(%[s4]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s4] "r"(s4));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s4]) \n\t"
- "sb %[p0_f0], -1(%[s4]) \n\t"
- "sb %[q0_f0], (%[s4]) \n\t"
- "sb %[q1_f0], +1(%[s4]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s4] "r"(s4));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s3]) \n\t"
- "sb %[p1_r], -2(%[s3]) \n\t"
- "sb %[p0_r], -1(%[s3]) \n\t"
- "sb %[q0_r], (%[s3]) \n\t"
- "sb %[q1_r], +1(%[s3]) \n\t"
- "sb %[q2_r], +2(%[s3]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s3] "r"(s3));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s3]) \n\t"
- "sb %[p0_f0], -1(%[s3]) \n\t"
- "sb %[q0_f0], (%[s3]) \n\t"
- "sb %[q1_f0], +1(%[s3]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s3] "r"(s3));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s2] "r"(s2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s2]) \n\t"
- "sb %[p0_f0], -1(%[s2]) \n\t"
- "sb %[q0_f0], (%[s2]) \n\t"
- "sb %[q1_f0], +1(%[s2]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s2] "r"(s2));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s1]) \n\t"
- "sb %[p1_l], -2(%[s1]) \n\t"
- "sb %[p0_l], -1(%[s1]) \n\t"
- "sb %[q0_l], (%[s1]) \n\t"
- "sb %[q1_l], +1(%[s1]) \n\t"
- "sb %[q2_l], +2(%[s1]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s1] "r"(s1));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s1]) \n\t"
- "sb %[p0_f0], -1(%[s1]) \n\t"
- "sb %[q0_f0], (%[s1]) \n\t"
- "sb %[q1_f0], +1(%[s1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s1] "r"(s1));
- }
- } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
- /* f0+f1+f2 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- PACK_LEFT_0TO3()
- mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
- &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
-
- PACK_RIGHT_0TO3()
- mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
- &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
-
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- if (mask & flat & flat2 & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p6_r], -7(%[s4]) \n\t"
- "sb %[p5_r], -6(%[s4]) \n\t"
- "sb %[p4_r], -5(%[s4]) \n\t"
- "sb %[p3_r], -4(%[s4]) \n\t"
- "sb %[p2_r], -3(%[s4]) \n\t"
- "sb %[p1_r], -2(%[s4]) \n\t"
- "sb %[p0_r], -1(%[s4]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [p0_r] "r"(p0_r), [s4] "r"(s4));
-
- __asm__ __volatile__(
- "sb %[q0_r], (%[s4]) \n\t"
- "sb %[q1_r], +1(%[s4]) \n\t"
- "sb %[q2_r], +2(%[s4]) \n\t"
- "sb %[q3_r], +3(%[s4]) \n\t"
- "sb %[q4_r], +4(%[s4]) \n\t"
- "sb %[q5_r], +5(%[s4]) \n\t"
- "sb %[q6_r], +6(%[s4]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [s4] "r"(s4));
- } else if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], -3(%[s4]) \n\t"
- "sb %[p1_r_f1], -2(%[s4]) \n\t"
- "sb %[p0_r_f1], -1(%[s4]) \n\t"
- "sb %[q0_r_f1], (%[s4]) \n\t"
- "sb %[q1_r_f1], +1(%[s4]) \n\t"
- "sb %[q2_r_f1], +2(%[s4]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s4]) \n\t"
- "sb %[p0_f0], -1(%[s4]) \n\t"
- "sb %[q0_f0], (%[s4]) \n\t"
- "sb %[q1_f0], +1(%[s4]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s4] "r"(s4));
- }
-
- __asm__ __volatile__(
- "srl %[p6_r], %[p6_r], 16 \n\t"
- "srl %[p5_r], %[p5_r], 16 \n\t"
- "srl %[p4_r], %[p4_r], 16 \n\t"
- "srl %[p3_r], %[p3_r], 16 \n\t"
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[q3_r], %[q3_r], 16 \n\t"
- "srl %[q4_r], %[q4_r], 16 \n\t"
- "srl %[q5_r], %[q5_r], 16 \n\t"
- "srl %[q6_r], %[q6_r], 16 \n\t"
-
- : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
- [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
- [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
- [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
- "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
- "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
- "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
- "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
- "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
- [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
- [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p6_r], -7(%[s3]) \n\t"
- "sb %[p5_r], -6(%[s3]) \n\t"
- "sb %[p4_r], -5(%[s3]) \n\t"
- "sb %[p3_r], -4(%[s3]) \n\t"
- "sb %[p2_r], -3(%[s3]) \n\t"
- "sb %[p1_r], -2(%[s3]) \n\t"
- "sb %[p0_r], -1(%[s3]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [p0_r] "r"(p0_r), [s3] "r"(s3));
-
- __asm__ __volatile__(
- "sb %[q0_r], (%[s3]) \n\t"
- "sb %[q1_r], +1(%[s3]) \n\t"
- "sb %[q2_r], +2(%[s3]) \n\t"
- "sb %[q3_r], +3(%[s3]) \n\t"
- "sb %[q4_r], +4(%[s3]) \n\t"
- "sb %[q5_r], +5(%[s3]) \n\t"
- "sb %[q6_r], +6(%[s3]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [s3] "r"(s3));
- } else if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], -3(%[s3]) \n\t"
- "sb %[p1_r_f1], -2(%[s3]) \n\t"
- "sb %[p0_r_f1], -1(%[s3]) \n\t"
- "sb %[q0_r_f1], (%[s3]) \n\t"
- "sb %[q1_r_f1], +1(%[s3]) \n\t"
- "sb %[q2_r_f1], +2(%[s3]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s3]) \n\t"
- "sb %[p0_f0], -1(%[s3]) \n\t"
- "sb %[q0_f0], (%[s3]) \n\t"
- "sb %[q1_f0], +1(%[s3]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s3] "r"(s3));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p6_l], -7(%[s2]) \n\t"
- "sb %[p5_l], -6(%[s2]) \n\t"
- "sb %[p4_l], -5(%[s2]) \n\t"
- "sb %[p3_l], -4(%[s2]) \n\t"
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [s2] "r"(s2));
-
- __asm__ __volatile__(
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
- "sb %[q3_l], +3(%[s2]) \n\t"
- "sb %[q4_l], +4(%[s2]) \n\t"
- "sb %[q5_l], +5(%[s2]) \n\t"
- "sb %[q6_l], +6(%[s2]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [q6_l] "r"(q6_l), [s2] "r"(s2));
- } else if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], -3(%[s2]) \n\t"
- "sb %[p1_l_f1], -2(%[s2]) \n\t"
- "sb %[p0_l_f1], -1(%[s2]) \n\t"
- "sb %[q0_l_f1], (%[s2]) \n\t"
- "sb %[q1_l_f1], +1(%[s2]) \n\t"
- "sb %[q2_l_f1], +2(%[s2]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s2]) \n\t"
- "sb %[p0_f0], -1(%[s2]) \n\t"
- "sb %[q0_f0], (%[s2]) \n\t"
- "sb %[q1_f0], +1(%[s2]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s2] "r"(s2));
- }
-
- __asm__ __volatile__(
- "srl %[p6_l], %[p6_l], 16 \n\t"
- "srl %[p5_l], %[p5_l], 16 \n\t"
- "srl %[p4_l], %[p4_l], 16 \n\t"
- "srl %[p3_l], %[p3_l], 16 \n\t"
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[q3_l], %[q3_l], 16 \n\t"
- "srl %[q4_l], %[q4_l], 16 \n\t"
- "srl %[q5_l], %[q5_l], 16 \n\t"
- "srl %[q6_l], %[q6_l], 16 \n\t"
-
- : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
- [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
- [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
- [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
- "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
- "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
- "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
- "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
- "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
- [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
- [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p6_l], -7(%[s1]) \n\t"
- "sb %[p5_l], -6(%[s1]) \n\t"
- "sb %[p4_l], -5(%[s1]) \n\t"
- "sb %[p3_l], -4(%[s1]) \n\t"
- "sb %[p2_l], -3(%[s1]) \n\t"
- "sb %[p1_l], -2(%[s1]) \n\t"
- "sb %[p0_l], -1(%[s1]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [s1] "r"(s1));
-
- __asm__ __volatile__(
- "sb %[q0_l], (%[s1]) \n\t"
- "sb %[q1_l], 1(%[s1]) \n\t"
- "sb %[q2_l], 2(%[s1]) \n\t"
- "sb %[q3_l], 3(%[s1]) \n\t"
- "sb %[q4_l], 4(%[s1]) \n\t"
- "sb %[q5_l], 5(%[s1]) \n\t"
- "sb %[q6_l], 6(%[s1]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [q6_l] "r"(q6_l), [s1] "r"(s1));
- } else if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], -3(%[s1]) \n\t"
- "sb %[p1_l_f1], -2(%[s1]) \n\t"
- "sb %[p0_l_f1], -1(%[s1]) \n\t"
- "sb %[q0_l_f1], (%[s1]) \n\t"
- "sb %[q1_l_f1], +1(%[s1]) \n\t"
- "sb %[q2_l_f1], +2(%[s1]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s1]) \n\t"
- "sb %[p0_f0], -1(%[s1]) \n\t"
- "sb %[q0_f0], (%[s1]) \n\t"
- "sb %[q1_f0], +1(%[s1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s1] "r"(s1));
- }
- }
- }
-}
-#endif // #if HAVE_DSPR2
diff --git a/aom_dsp/mips/loopfilter_msa.h b/aom_dsp/mips/loopfilter_msa.h
deleted file mode 100644
index 54b0bb4bd..000000000
--- a/aom_dsp/mips/loopfilter_msa.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- filt = filt & (v16i8)hev_in; \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- /* combine left and right part */ \
- filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
- \
- filt = filt & (v16i8)mask_in; \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
- }
-
-#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- \
- filt = filt & (v16i8)hev_in; \
- \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
- filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
- filt_l += q0_sub_p0_l; \
- filt_l = __msa_sat_s_h(filt_l, 7); \
- \
- filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
- filt = filt & (v16i8)mask_in; \
- \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
- }
-
-#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
- { \
- v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
- v16u8 zero_in = { 0 }; \
- \
- tmp_flat4 = __msa_ori_b(zero_in, 1); \
- p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
- q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
- p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
- q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
- \
- p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
- flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
- p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
- flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
- \
- flat_out = (tmp_flat4 < (v16u8)flat_out); \
- flat_out = __msa_xori_b(flat_out, 0xff); \
- flat_out = flat_out & (mask); \
- }
-
-#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
- q6_in, q7_in, flat_in, flat2_out) \
- { \
- v16u8 tmp_flat5, zero_in = { 0 }; \
- v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
- v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
- \
- tmp_flat5 = __msa_ori_b(zero_in, 1); \
- p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
- q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
- p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
- q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
- p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
- q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
- p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
- q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
- \
- p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
- flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
- flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
- p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
- flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
- p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
- flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
- \
- flat2_out = (tmp_flat5 < (v16u8)flat2_out); \
- flat2_out = __msa_xori_b(flat2_out, 0xff); \
- flat2_out = flat2_out & flat_in; \
- }
-
-#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
- p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
- q1_filt8_out, q2_filt8_out) \
- { \
- v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
- \
- tmp_filt8_2 = p2_in + p1_in + p0_in; \
- tmp_filt8_0 = p3_in << 1; \
- \
- tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \
- tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \
- p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \
- p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_1 = q2_in + q1_in + q0_in; \
- tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \
- tmp_filt8_0 = tmp_filt8_2 + (p0_in); \
- tmp_filt8_0 = tmp_filt8_0 + (p3_in); \
- p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \
- \
- tmp_filt8_0 = q2_in + q3_in; \
- tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \
- tmp_filt8_1 = q3_in + q3_in; \
- tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \
- q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_0 = tmp_filt8_2 + q3_in; \
- tmp_filt8_1 = tmp_filt8_0 + q0_in; \
- q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_1 = tmp_filt8_0 - p2_in; \
- tmp_filt8_0 = q1_in + q3_in; \
- tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \
- q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- }
-
-#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
- limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
- flat_out) \
- { \
- v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
- v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
- \
- /* absolute subtraction of pixel values */ \
- p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
- p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
- p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
- q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
- q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
- q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
- p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
- p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
- \
- /* calculation of hev */ \
- flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
- hev_out = thresh_in < (v16u8)flat_out; \
- \
- /* calculation of mask */ \
- p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
- p1_asub_q1_m >>= 1; \
- p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
- \
- mask_out = b_limit_in < p0_asub_q0_m; \
- mask_out = __msa_max_u_b(flat_out, mask_out); \
- p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
- mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
- q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
- mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
- \
- mask_out = limit_in < (v16u8)mask_out; \
- mask_out = __msa_xori_b(mask_out, 0xff); \
- }
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
diff --git a/aom_dsp/mips/macros_msa.h b/aom_dsp/mips/macros_msa.h
deleted file mode 100644
index 9bfc27147..000000000
--- a/aom_dsp/mips/macros_msa.h
+++ /dev/null
@@ -1,2058 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_
-#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_
-
-#include <msa.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
-
-#if (__mips_isa_rev >= 6)
-#define LH(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
- })
-#endif // (__mips == 64)
-
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint64_t val_m = (val); \
- \
- __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-#else // !(__mips_isa_rev >= 6)
-#define LH(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m_combined = 0; \
- \
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
- \
- val_m_combined = (uint64_t)(val1_m); \
- val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
- val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \
- \
- val_m_combined; \
- })
-#endif // (__mips == 64)
-
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m1 = (uint8_t *)(pdst); \
- uint32_t val0_m, val1_m; \
- \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- \
- SW(val0_m, pdst_m1); \
- SW(val1_m, pdst_m1 + 4); \
- }
-#endif // (__mips_isa_rev >= 6)
-
-/* Description : Load 4 words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1, out2, out3
- Details : Load word in 'out0' from (psrc)
- Load word in 'out1' from (psrc + stride)
- Load word in 'out2' from (psrc + 2 * stride)
- Load word in 'out3' from (psrc + 3 * stride)
-*/
-#define LW4(psrc, stride, out0, out1, out2, out3) \
- { \
- out0 = LW((psrc)); \
- out1 = LW((psrc) + stride); \
- out2 = LW((psrc) + 2 * stride); \
- out3 = LW((psrc) + 3 * stride); \
- }
-
-/* Description : Load double words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load double word in 'out0' from (psrc)
- Load double word in 'out1' from (psrc + stride)
-*/
-#define LD2(psrc, stride, out0, out1) \
- { \
- out0 = LD((psrc)); \
- out1 = LD((psrc) + stride); \
- }
-#define LD4(psrc, stride, out0, out1, out2, out3) \
- { \
- LD2((psrc), stride, out0, out1); \
- LD2((psrc) + 2 * stride, stride, out2, out3); \
- }
-
-/* Description : Store 4 words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store word from 'in0' to (pdst)
- Store word from 'in1' to (pdst + stride)
- Store word from 'in2' to (pdst + 2 * stride)
- Store word from 'in3' to (pdst + 3 * stride)
-*/
-#define SW4(in0, in1, in2, in3, pdst, stride) \
- { \
- SW(in0, (pdst)) \
- SW(in1, (pdst) + stride); \
- SW(in2, (pdst) + 2 * stride); \
- SW(in3, (pdst) + 3 * stride); \
- }
-
-/* Description : Store 4 double words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store double word from 'in0' to (pdst)
- Store double word from 'in1' to (pdst + stride)
- Store double word from 'in2' to (pdst + 2 * stride)
- Store double word from 'in3' to (pdst + 3 * stride)
-*/
-#define SD4(in0, in1, in2, in3, pdst, stride) \
- { \
- SD(in0, (pdst)) \
- SD(in1, (pdst) + stride); \
- SD(in2, (pdst) + 2 * stride); \
- SD(in3, (pdst) + 3 * stride); \
- }
-
-/* Description : Load vectors with 16 byte elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Load 16 byte elements in 'out0' from (psrc)
- Load 16 byte elements in 'out1' from (psrc + stride)
-*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) \
- { \
- out0 = LD_B(RTYPE, (psrc)); \
- out1 = LD_B(RTYPE, (psrc) + stride); \
- }
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-
-#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
- { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
- }
-#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
-
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
- { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
- }
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
-
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
- { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
- }
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
-
-#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
- { \
- LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
- LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
- }
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
-
-#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
- out7) \
- { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
- }
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
-
-/* Description : Load vectors with 8 halfword elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load 8 halfword elements in 'out0' from (psrc)
- Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) \
- { \
- out0 = LD_H(RTYPE, (psrc)); \
- out1 = LD_H(RTYPE, (psrc) + (stride)); \
- }
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
- { \
- LD_H2(RTYPE, (psrc), stride, out0, out1); \
- LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
- }
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-
-#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
- out7) \
- { \
- LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
- }
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
- out7, out8, out9, out10, out11, out12, out13, out14, out15) \
- { \
- LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
- out7); \
- LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
- out13, out14, out15); \
- }
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
-
-/* Description : Load 4x4 block of signed halfword elements from 1D source
- data into 4 vectors (Each vector with 4 signed halfwords)
- Arguments : Input - psrc
- Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3) \
- { \
- out0 = LD_SH(psrc); \
- out2 = LD_SH(psrc + 8); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
- }
-
-/* Description : Load 2 vectors of signed word elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) \
- { \
- out0 = LD_SW((psrc)); \
- out1 = LD_SW((psrc) + stride); \
- }
-
-/* Description : Store vectors of 16 byte elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 16 byte elements from 'in0' to (pdst)
- Store 16 byte elements from 'in1' to (pdst + stride)
-*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) \
- { \
- ST_B(RTYPE, in0, (pdst)); \
- ST_B(RTYPE, in1, (pdst) + stride); \
- }
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
- { \
- ST_B2(RTYPE, in0, in1, (pdst), stride); \
- ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
- }
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
- { \
- ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
- ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
- }
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 8 halfword elements from 'in0' to (pdst)
- Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) \
- { \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
- }
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
- { \
- ST_H2(RTYPE, in0, in1, (pdst), stride); \
- ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
- }
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
- { \
- ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
- ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
- }
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 4 word elements from 'in0' to (pdst)
- Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) \
- { \
- ST_SW(in0, (pdst)); \
- ST_SW(in1, (pdst) + stride); \
- }
-
-/* Description : Store 2x4 byte block to destination memory from input vector
- Arguments : Inputs - in, stidx, pdst, stride
- Details : Index 'stidx' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst)
- Index 'stidx+1' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + stride)
- Index 'stidx+2' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 2 * stride)
- Index 'stidx+3' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 3 * stride)
-*/
-#define ST2x4_UB(in, stidx, pdst, stride) \
- { \
- uint16_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
- out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
- out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
- out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
- \
- SH(out0_m, pblk_2x4_m); \
- SH(out1_m, pblk_2x4_m + stride); \
- SH(out2_m, pblk_2x4_m + 2 * stride); \
- SH(out3_m, pblk_2x4_m + 3 * stride); \
- }
-
-/* Description : Store 4x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 word element from 'in' vector is copied to the GP
- register and stored to (pdst)
- Index 1 word element from 'in' vector is copied to the GP
- register and stored to (pdst + stride)
-*/
-#define ST4x2_UB(in, pdst, stride) \
- { \
- uint32_t out0_m, out1_m; \
- uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in, 0); \
- out1_m = __msa_copy_u_w((v4i32)in, 1); \
- \
- SW(out0_m, pblk_4x2_m); \
- SW(out1_m, pblk_4x2_m + stride); \
- }
-
-/* Description : Store 4x4 byte block to destination memory from input vector
- Arguments : Inputs - in0, in1, pdst, stride
- Details : 'Idx0' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst)
- 'Idx1' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + stride)
- 'Idx2' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 2 * stride)
- 'Idx3' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
- { \
- uint32_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
- out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
- out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
- out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
- \
- SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
- }
-#define ST4x8_UB(in0, in1, pdst, stride) \
- { \
- uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
- \
- ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
- ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
- }
-
-/* Description : Store 8x1 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
-*/
-#define ST8x1_UB(in, pdst) \
- { \
- uint64_t out0_m; \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- SD(out0_m, pdst); \
- }
-
-/* Description : Store 8x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in' vector is copied to the
- GP register and stored to (pdst + stride)
-*/
-#define ST8x2_UB(in, pdst, stride) \
- { \
- uint64_t out0_m, out1_m; \
- uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- out1_m = __msa_copy_u_d((v2i64)in, 1); \
- \
- SD(out0_m, pblk_8x2_m); \
- SD(out1_m, pblk_8x2_m + stride); \
- }
-
-/* Description : Store 8x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Index 0 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst + stride)
- Index 0 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 2 * stride)
- Index 1 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST8x4_UB(in0, in1, pdst, stride) \
- { \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in0, 0); \
- out1_m = __msa_copy_u_d((v2i64)in0, 1); \
- out2_m = __msa_copy_u_d((v2i64)in1, 0); \
- out3_m = __msa_copy_u_d((v2i64)in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
- }
-
-/* Description : average with rounding (in0 + in1 + 1) / 2.
- Arguments : Inputs - in0, in1, in2, in3,
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned byte element from 'in0' vector is added with
- each unsigned byte element from 'in1' vector. Then the average
- with rounding is calculated and written to 'out0'
-*/
-#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
- out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
- }
-#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
-
-#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
- AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
- }
-#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide with zero
- Arguments : Inputs - in0, in1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'zero_m' vector are slid into 'in0' by
- value specified in the 'slide_val'
-*/
-#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
- { \
- v16i8 zero_m = { 0 }; \
- out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
- }
-#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
-
-#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
- slide_val) \
- { \
- SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
- SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
- }
-#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide
- Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
- value specified in the 'slide_val'
-*/
-#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
- { \
- out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
- }
-#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
-#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
-
-#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
- out2, slide_val) \
- { \
- SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
- out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
- }
-#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
-#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
-
-/* Description : Shuffle byte vector elements as per mask vector
- Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'in0' & 'in1' are copied selectively to
- 'out0' as per control vector 'mask0'
-*/
-#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
- }
-#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
-#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
-#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
-
-#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
- out3) \
- { \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
- }
-#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
-#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Unsigned byte elements from 'mult0' are multiplied with
- unsigned byte elements from 'cnst0' producing a result
- twice the size of input i.e. unsigned halfword.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
- out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
- }
-#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
-
-#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
- }
-#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
-
-#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
- }
-#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
-
-#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
-
-/* Description : Dot product of word vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed word elements from 'mult0' are multiplied with
- signed word elements from 'cnst0' producing a result
- twice the size of input i.e. signed double word.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
- }
-#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
-
-/* Description : Dot product & addition of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
- The multiplication result of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
- }
-#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
-
-#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product & addition of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
- The multiplication result of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
- }
-#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
-
-/* Description : Dot product & addition of double word vector elements
- Arguments : Inputs - mult0, mult1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed word element from 'mult0' is multiplied with itself
- producing an intermediate result twice the size of input
- i.e. signed double word
- The multiplication result of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
- out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
- }
-#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
-
-/* Description : Minimum values between unsigned elements of
- either vector are copied to the output vector
- Arguments : Inputs - in0, in1, min_vec
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Minimum of unsigned halfword element values from 'in0' and
- 'min_vec' are written to output vector 'in0'
-*/
-#define MIN_UH2(RTYPE, in0, in1, min_vec) \
- { \
- in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
- in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
- }
-#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
-
-#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
- { \
- MIN_UH2(RTYPE, in0, in1, min_vec); \
- MIN_UH2(RTYPE, in2, in3, min_vec); \
- }
-#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Clips all signed halfword elements of input vector
- between 0 & 255
- Arguments : Input - in
- Output - out_m
- Return Type - signed halfword
-*/
-#define CLIP_SH_0_255(in) \
- ({ \
- v8i16 max_m = __msa_ldi_h(255); \
- v8i16 out_m; \
- \
- out_m = __msa_maxi_s_h((v8i16)in, 0); \
- out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
- out_m; \
- })
-#define CLIP_SH2_0_255(in0, in1) \
- { \
- in0 = CLIP_SH_0_255(in0); \
- in1 = CLIP_SH_0_255(in1); \
- }
-#define CLIP_SH4_0_255(in0, in1, in2, in3) \
- { \
- CLIP_SH2_0_255(in0, in1); \
- CLIP_SH2_0_255(in2, in3); \
- }
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
- Arguments : Input - in (signed word vector)
- Output - sum_m (i32 sum)
- Return Type - signed word (GP)
- Details : 4 signed word elements of 'in' vector are added together and
- the resulting integer sum is returned
-*/
-#define HADD_SW_S32(in) \
- ({ \
- v2i64 res0_m, res1_m; \
- int32_t sum_m; \
- \
- res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
- res1_m = __msa_splati_d(res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
- sum_m; \
- })
-
-/* Description : Horizontal addition of 8 unsigned halfword elements
- Arguments : Inputs - in (unsigned halfword vector)
- Outputs - sum_m (u32 sum)
- Return Type - unsigned word
- Details : 8 unsigned halfword elements of input vector are added
- together and the resulting integer sum is returned
-*/
-#define HADD_UH_U32(in) \
- ({ \
- v4u32 res_m; \
- v2u64 res0_m, res1_m; \
- uint32_t sum_m; \
- \
- res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
- res0_m = __msa_hadd_u_d(res_m, res_m); \
- res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
- sum_m; \
- })
-
-/* Description : Horizontal addition of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is added to
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
- }
-#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
-
-#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- HADD_UB2(RTYPE, in0, in1, out0, out1); \
- HADD_UB2(RTYPE, in2, in3, out2, out3); \
- }
-#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is subtracted from
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
- }
-#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
-
-/* Description : SAD (Sum of Absolute Difference)
- Arguments : Inputs - in0, in1, ref0, ref1
- Outputs - sad_m (halfword vector)
- Return Type - unsigned halfword
- Details : Absolute difference of all the byte elements from 'in0' with
- 'ref0' is calculated and preserved in 'diff0'. Then even-odd
- pairs are added together to generate 8 halfword results.
-*/
-#define SAD_UB2_UH(in0, in1, ref0, ref1) \
- ({ \
- v16u8 diff0_m, diff1_m; \
- v8u16 sad_m = { 0 }; \
- \
- diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
- diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
- \
- sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
- sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
- \
- sad_m; \
- })
-
-/* Description : Horizontal subtraction of signed halfword vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed odd halfword element from 'in0' is subtracted from
- even signed halfword element from 'in0' (pairwise) and the
- word result is written to 'out0'
-*/
-#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
- out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
- }
-#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
-
-/* Description : Set element n of input vector to GPR value
- Arguments : Inputs - in0, in1, in2, in3
- Output - out
- Return Type - as per RTYPE
- Details : Set element 0 in vector 'out' to value specified in 'in0'
-*/
-#define INSERT_W2(RTYPE, in0, in1, out) \
- { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- }
-#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
- { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
- }
-#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
-#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
-
-#define INSERT_D2(RTYPE, in0, in1, out) \
- { \
- out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
- out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
- }
-#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
-#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
- }
-#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
-#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave even halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
- out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
- }
-#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
-#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
-#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave even word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
- out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
- }
-#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
- out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
- }
-#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
-
-/* Description : Interleave left half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
- }
-#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
-#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
-#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
-#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
-
-#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
-#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
-#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
-
-/* Description : Interleave left half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
- }
-#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
-#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave left half of word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
- }
-#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
-#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave right half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements of 'in0' and 'in1' are interleaved
- and written to out0.
-*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
- }
-#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
-#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
-#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
-#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
-
-#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
-#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
-#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
-#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
-
-#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
- in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \
- out5, out6, out7) \
- { \
- ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
- out3); \
- ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \
- out6, out7); \
- }
-#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
-
-/* Description : Interleave right half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
- }
-#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
-#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
-
-#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
-
-#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
- }
-#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
-#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
-
-#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave right half of double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of double word elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
- out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
- }
-#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
-#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
-#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-
-#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
- { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
- }
-#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
-
-#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
-#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave both left and right half of input vectors
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements from 'in0' and 'in1' are
- interleaved and written to 'out0'
-*/
-#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
- }
-#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
-#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
-#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
-#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
-
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
- }
-#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
-#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
-
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
- }
-#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
-#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
-#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
- unsigned value of (sat_val + 1) bits
- The element data width remains unchanged
- Arguments : Inputs - in0, in1, sat_val
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned halfword element from 'in0' is saturated to the
- value generated with (sat_val + 1) bit range.
- The results are written in place
-*/
-#define SAT_UH2(RTYPE, in0, in1, sat_val) \
- { \
- in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
- }
-#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
-
-#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
- { \
- SAT_UH2(RTYPE, in0, in1, sat_val); \
- SAT_UH2(RTYPE, in2, in3, sat_val) \
- }
-#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 signed value of (sat_val + 1) bits
- The element data width remains unchanged
- Arguments : Inputs - in0, in1, sat_val
- Outputs - in place operation
- Return Type - as per RTYPE
-   Details     : Each signed halfword element from 'in0' is saturated to the
- value generated with (sat_val + 1) bit range
- The results are written in place
-*/
-#define SAT_SH2(RTYPE, in0, in1, sat_val) \
- { \
- in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
- }
-#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
-
-#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
- { \
- SAT_SH2(RTYPE, in0, in1, sat_val); \
- SAT_SH2(RTYPE, in2, in3, sat_val); \
- }
-#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Indexed halfword element values are replicated to all
- elements in output vector
- Arguments : Inputs - in, idx0, idx1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : 'idx0' element value from 'in' vector is replicated to all
- elements in 'out0' vector
- Valid index range for halfword operation is 0-7
-*/
-#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
- out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
- }
-#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
-
-#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
- { \
- SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
- SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
- }
-#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
-#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even byte elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' are copied to the left half of
- 'out0' & even byte elements of 'in1' are copied to the right
- half of 'out0'.
-*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
- }
-#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
-#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
-#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
-
-#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
-#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
-#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even halfword elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' are copied to the left half of
- 'out0' & even halfword elements of 'in1' are copied to the
- right half of 'out0'.
-*/
-#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
- }
-#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
-#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
-
-#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even double word elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
-   Details     : Even double word elements of 'in0' are copied to the left
-                 half of 'out0' & even double word elements of 'in1' are
-                 copied to the right half of 'out0'.
-*/
-#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
- out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
- }
-#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
-#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
-
-#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
-
-/* Description : Each byte element is logically xor'ed with immediate 128
- Arguments : Inputs - in0, in1
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned byte element from input vector 'in0' is
- logically xor'ed with 128 and the result is stored in-place.
-*/
-#define XORI_B2_128(RTYPE, in0, in1) \
- { \
- in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
- in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
- }
-#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
-#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
-
-#define XORI_B3_128(RTYPE, in0, in1, in2) \
- { \
- XORI_B2_128(RTYPE, in0, in1); \
- in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
- }
-#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
-
-#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
- { \
- XORI_B2_128(RTYPE, in0, in1); \
- XORI_B2_128(RTYPE, in2, in3); \
- }
-#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
-#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
-
-#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
- { \
- XORI_B4_128(RTYPE, in0, in1, in2, in3); \
- XORI_B3_128(RTYPE, in4, in5, in6); \
- }
-#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
-
-/* Description : Average of signed halfword elements -> (a + b) / 2
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
- Details : Each signed halfword element from 'in0' is added to each
- signed halfword element of 'in1' with full precision resulting
- in one extra bit in the result. The result is then divided by
- 2 and written to 'out0'
-*/
-#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
- out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
- out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
- }
-#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Addition of signed halfword elements and signed saturation
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'in0' are added to signed
- halfword elements of 'in1'. The result is then signed saturated
- between halfword data type range
-*/
-#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
- }
-#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
-
-#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Shift left all elements of vector (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is left shifted by 'shift' and
- the result is written in-place.
-*/
-#define SLLI_4V(in0, in1, in2, in3, shift) \
- { \
- in0 = in0 << shift; \
- in1 = in1 << shift; \
- in2 = in2 << shift; \
- in3 = in3 << shift; \
- }
-
-/* Description : Arithmetic shift right all elements of vector
- (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is right shifted by 'shift' and
- the result is written in-place. 'shift' is a GP variable.
-*/
-#define SRA_4V(in0, in1, in2, in3, shift) \
- { \
- in0 = in0 >> shift; \
- in1 = in1 >> shift; \
- in2 = in2 >> shift; \
- in3 = in3 >> shift; \
- }
-
-/* Description : Shift right arithmetic rounded words
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the number of bits in the corresponding element in the vector
- 'shift'. The last discarded bit is added to shifted value for
- rounding and the result is written in-place.
- 'shift' is a vector.
-*/
-#define SRAR_W2(RTYPE, in0, in1, shift) \
- { \
- in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
- in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
- }
-
-#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
- { \
- SRAR_W2(RTYPE, in0, in1, shift) \
- SRAR_W2(RTYPE, in2, in3, shift) \
- }
-#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
-
-/* Description : Shift right arithmetic rounded (immediate)
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the value in 'shift'. The last discarded bit is added to the
- shifted value for rounding and the result is written in-place.
- 'shift' is an immediate value.
-*/
-#define SRARI_H2(RTYPE, in0, in1, shift) \
- { \
- in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
- in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
- }
-#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
-#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
-
-#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
- { \
- SRARI_H2(RTYPE, in0, in1, shift); \
- SRARI_H2(RTYPE, in2, in3, shift); \
- }
-#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
-#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
-
-#define SRARI_W2(RTYPE, in0, in1, shift) \
- { \
- in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
- in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
- }
-#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
- { \
- SRARI_W2(RTYPE, in0, in1, shift); \
- SRARI_W2(RTYPE, in2, in3, shift); \
- }
-#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
-
-/* Description : Logical shift right all elements of vector (immediate)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is logically right shifted by
- 'shift' and the result is written to 'out0'. 'shift' is an immediate value.
-*/
-#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
- { \
- out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
- out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
- out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
- out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
- }
-#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Multiplication of pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element from 'in0' is multiplied with the corresponding
- element from 'in1' and the result is written to 'out0'
-*/
-#define MUL2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 * in1; \
- out1 = in2 * in3; \
- }
-#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
- { \
- MUL2(in0, in1, in2, in3, out0, out1); \
- MUL2(in4, in5, in6, in7, out2, out3); \
- }
-
-/* Description : Addition of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in0' is added to 'in1' and result is written
- to 'out0'.
-*/
-#define ADD2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
- }
-#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
- { \
- ADD2(in0, in1, in2, in3, out0, out1); \
- ADD2(in4, in5, in6, in7, out2, out3); \
- }
-
-/* Description : Subtraction of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in1' is subtracted from 'in0' and result is
- written to 'out0'.
-*/
-#define SUB2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
- }
-#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
- { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
- out2 = in4 - in5; \
- out3 = in6 - in7; \
- }
-
-/* Description : Sign extend halfword elements from right half of the vector
- Arguments : Input - in (halfword vector)
- Output - out (sign extended word vector)
- Return Type - signed word
- Details : Sign bit of halfword elements from input vector 'in' is
- extracted and interleaved with the same vector 'in' to generate
- 4 word elements keeping sign intact
-*/
-#define UNPCK_R_SH_SW(in, out) \
- { \
- v8i16 sign_m; \
- \
- sign_m = __msa_clti_s_h((v8i16)in, 0); \
- out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
- }
-
-/* Description : Zero extend unsigned byte elements to halfword elements
- Arguments : Input - in (unsigned byte vector)
- Outputs - out0, out1 (unsigned halfword vectors)
- Return Type - signed halfword
- Details : Zero extended right half of vector is returned in 'out0'
- Zero extended left half of vector is returned in 'out1'
-*/
-#define UNPCK_UB_SH(in, out0, out1) \
- { \
- v16i8 zero_m = { 0 }; \
- \
- ILVRL_B2_SH(zero_m, in, out0, out1); \
- }
-
-/* Description : Sign extend halfword elements from input vector and return
- the result in pair of vectors
- Arguments : Input - in (halfword vector)
- Outputs - out0, out1 (sign extended word vectors)
- Return Type - signed word
- Details : Sign bit of halfword elements from input vector 'in' is
- extracted and interleaved right with the same vector 'in' to
- generate 4 signed word elements in 'out0',
- then interleaved left with the same vector 'in' to
- generate 4 signed word elements in 'out1'
-*/
-#define UNPCK_SH_SW(in, out0, out1) \
- { \
- v8i16 tmp_m; \
- \
- tmp_m = __msa_clti_s_h((v8i16)in, 0); \
- ILVRL_H2_SW(tmp_m, in, out0, out1); \
- }
-
-/* Description : Butterfly of 4 input vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Details : Butterfly operation
-*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- out0 = in0 + in3; \
- out1 = in1 + in2; \
- \
- out2 = in1 - in2; \
- out3 = in0 - in3; \
- }
-
-/* Description : Butterfly of 8 input vectors
- Arguments : Inputs - in0 ... in7
- Outputs - out0 .. out7
- Details : Butterfly operation
-*/
-#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
- out3, out4, out5, out6, out7) \
- { \
- out0 = in0 + in7; \
- out1 = in1 + in6; \
- out2 = in2 + in5; \
- out3 = in3 + in4; \
- \
- out4 = in3 - in4; \
- out5 = in2 - in5; \
- out6 = in1 - in6; \
- out7 = in0 - in7; \
- }
-
-/* Description : Butterfly of 16 input vectors
- Arguments : Inputs - in0 ... in15
- Outputs - out0 .. out15
- Details : Butterfly operation
-*/
-#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
- in11, in12, in13, in14, in15, out0, out1, out2, out3, \
- out4, out5, out6, out7, out8, out9, out10, out11, out12, \
- out13, out14, out15) \
- { \
- out0 = in0 + in15; \
- out1 = in1 + in14; \
- out2 = in2 + in13; \
- out3 = in3 + in12; \
- out4 = in4 + in11; \
- out5 = in5 + in10; \
- out6 = in6 + in9; \
- out7 = in7 + in8; \
- \
- out8 = in7 - in8; \
- out9 = in6 - in9; \
- out10 = in5 - in10; \
- out11 = in4 - in11; \
- out12 = in3 - in12; \
- out13 = in2 - in13; \
- out14 = in1 - in14; \
- out15 = in0 - in15; \
- }
-
-/* Description : Transpose input 8x8 byte block
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
- out1, out2, out3, out4, out5, out6, out7) \
- { \
- v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
- tmp3_m); \
- ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
- ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
- ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
- ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
- SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
- SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
- }
-#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
-
-/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - unsigned byte
-*/
-#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
- in10, in11, in12, in13, in14, in15, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
- ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
- ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
- ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
- \
- tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
- tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
- tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
- tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
- out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
- tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
- out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
- tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
- \
- ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
- out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
- out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
- out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
- out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- }
-
-/* Description : Transpose 4x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed halfword
-*/
-#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v8i16 s0_m, s1_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
- ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
- }
-
-/* Description : Transpose 4x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - signed halfword
-*/
-#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
- v8i16 zero_m = { 0 }; \
- \
- ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
- tmp3_n); \
- ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
- ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
- \
- out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- \
- out4 = zero_m; \
- out5 = zero_m; \
- out6 = zero_m; \
- out7 = zero_m; \
- }
-
-/* Description : Transpose 8x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed halfword
-*/
-#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
- ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
- ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
- }
-
-/* Description : Transpose 8x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
- out1, out2, out3, out4, out5, out6, out7) \
- { \
- v8i16 s0_m, s1_m; \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
- ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
- ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
- PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
- tmp7_m, out0, out2, out4, out6); \
- out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
- out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
- out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
- out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
- }
-#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
-
-/* Description : Transpose 4x4 block with word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed word
-*/
-#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
- ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
- \
- out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
- out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
- out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
- out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
- }
-
-/* Description : Add block 4x4
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Least significant 4 bytes from each input vector are added to
- the destination bytes, clipped between 0-255 and stored.
-*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
- { \
- uint32_t src0_m, src1_m, src2_m, src3_m; \
- v8i16 inp0_m, inp1_m, res0_m, res1_m; \
- v16i8 dst0_m = { 0 }; \
- v16i8 dst1_m = { 0 }; \
- v16i8 zero_m = { 0 }; \
- \
- ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
- LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
- INSERT_W2_SB(src0_m, src1_m, dst0_m); \
- INSERT_W2_SB(src2_m, src3_m, dst1_m); \
- ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
- ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
- CLIP_SH2_0_255(res0_m, res1_m); \
- PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
- ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
- }
-
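
As a plain-C sketch of what ADDBLK_ST4x4_UB above computes (a hypothetical helper; the residuals are assumed to be a contiguous 4x4 block of halfwords):

#include <stdint.h>

/* Add a 4x4 block of halfword residuals to the destination pixels,
   clip the sums to [0, 255] and store them back. */
static void addblk_4x4_ref(const int16_t resid[16], uint8_t *dst,
                           int32_t stride) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      const int v = dst[c] + resid[r * 4 + c];
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst += stride;
  }
}
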
-/* Description : Pack even elements of input vectors & xor with 128
- Arguments : Inputs - in0, in1
- Output - out_m
- Return Type - unsigned byte
- Details : Signed byte even elements from 'in0' and 'in1' are packed
- together in one vector and the resulting vector is xor'ed with
- 128 to shift the range from signed to unsigned byte
-*/
-#define PCKEV_XORI128_UB(in0, in1) \
- ({ \
- v16u8 out_m; \
- \
- out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
- out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
- out_m; \
- })
-
-/* Description : Converts inputs to unsigned bytes, interleaves, averages and
- stores them as an 8x4 unsigned byte block
- Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
- pdst, stride
-*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
- pdst, stride) \
- { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- tmp0_m = PCKEV_XORI128_UB(in0, in1); \
- tmp1_m = PCKEV_XORI128_UB(in2, in3); \
- ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
- }
-
-/* Description : Pack even byte elements and store byte vector in destination
- memory
- Arguments : Inputs - in0, in1, pdst
-*/
-#define PCKEV_ST_SB(in0, in1, pdst) \
- { \
- v16i8 tmp_m; \
- \
- tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
- ST_SB(tmp_m, (pdst)); \
- }
-
-/* Description : Horizontal 2 tap filter kernel code
- Arguments : Inputs - in0, in1, mask, coeff, shift
-*/
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
- ({ \
- v16i8 tmp0_m; \
- v8u16 tmp1_m; \
- \
- tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
- tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
- tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
- \
- tmp1_m; \
- })
-#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_
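
HORIZ_2TAP_FILT_UH above is the building block for the bilinear (2-tap) sub-pel filtering used by the MSA sub-pixel variance code later in this change. A scalar sketch of one output row, assuming FILTER_BITS is 7 (its value in aom_dsp/aom_filter.h) and that the two bilinear taps sum to 128 (hypothetical helper):

#include <stdint.h>

/* out[c] = round((src[c] * f0 + src[c + 1] * f1) >> 7), the scalar
   equivalent of the VSHF + DOTP + SRARI sequence in the macro. */
static void horiz_2tap_ref(const uint8_t *src, uint8_t *out, int width,
                           const uint8_t filter[2]) {
  for (int c = 0; c < width; ++c) {
    const int sum = src[c] * filter[0] + src[c + 1] * filter[1];
    out[c] = (uint8_t)((sum + 64) >> 7);  /* add 64 to round before >> 7 */
  }
}
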
diff --git a/aom_dsp/mips/sad_msa.c b/aom_dsp/mips/sad_msa.c
deleted file mode 100644
index 01d4a5239..000000000
--- a/aom_dsp/mips/sad_msa.c
+++ /dev/null
@@ -1,800 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
- { \
- out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
- }
-#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
-
-static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 diff;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
- diff = __msa_asub_u_b(src, ref);
- sad += __msa_hadd_u_h(diff, diff);
- }
-
- return HADD_UH_U32(sad);
-}
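
All of the sad_*width_msa helpers in this file compute the same quantity as the following plain-C reference (a hypothetical sketch, not part of libaom), just vectorized per block width:

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a width x height block. */
static uint32_t sad_ref(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride,
                        int32_t width, int32_t height) {
  uint32_t sad = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
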
-
-static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
- LD_UB2(ref, ref_stride, ref0, ref1);
- ref += (2 * ref_stride);
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
- LD_UB2(ref, ref_stride, ref0, ref1);
- ref += (2 * ref_stride);
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- uint32_t sad = 0;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = HADD_UH_U32(sad0);
- sad += HADD_UH_U32(sad1);
-
- return sad;
-}
-
-static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *const aref_ptr[4],
- int32_t ref_stride, int32_t height,
- uint32_t sad_array[4]) {
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- INSERT_W4_UB(src0, src1, src2, src3, src);
- src_ptr += (4 * src_stride);
-
- LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref0_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref1_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref2_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref3_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *const aref_ptr[4],
- int32_t ref_stride, int32_t height,
- uint32_t sad_array[4]) {
- int32_t ht_cnt;
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
- v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref0_ptr += (4 * ref_stride);
- LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
- ref1_ptr += (4 * ref_stride);
- LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
- ref2_ptr += (4 * ref_stride);
- LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
- ref3_ptr += (4 * ref_stride);
-
- PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *const aref_ptr[4],
- int32_t ref_stride, int32_t height,
- uint32_t sad_array[4]) {
- int32_t ht_cnt;
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- v16u8 src, ref0, ref1, ref2, ref3, diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref0 = LD_UB(ref0_ptr);
- ref0_ptr += ref_stride;
- ref1 = LD_UB(ref1_ptr);
- ref1_ptr += ref_stride;
- ref2 = LD_UB(ref2_ptr);
- ref2_ptr += ref_stride;
- ref3 = LD_UB(ref3_ptr);
- ref3_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref1);
- sad1 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref2);
- sad2 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref3);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref0 = LD_UB(ref0_ptr);
- ref0_ptr += ref_stride;
- ref1 = LD_UB(ref1_ptr);
- ref1_ptr += ref_stride;
- ref2 = LD_UB(ref2_ptr);
- ref2_ptr += ref_stride;
- ref3 = LD_UB(ref3_ptr);
- ref3_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref1);
- sad1 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref2);
- sad2 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref3);
- sad3 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *const aref_ptr[4],
- int32_t ref_stride, int32_t height,
- uint32_t sad_array[4]) {
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
-
- LD_UB2(ref0_ptr, 16, ref0, ref1);
- ref0_ptr += ref_stride;
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(ref1_ptr, 16, ref0, ref1);
- ref1_ptr += ref_stride;
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(ref2_ptr, 16, ref0, ref1);
- ref2_ptr += ref_stride;
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(ref3_ptr, 16, ref0, ref1);
- ref3_ptr += ref_stride;
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *const aref_ptr[4],
- int32_t ref_stride, int32_t height,
- uint32_t sad_array[4]) {
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 sad0_0 = { 0 };
- v8u16 sad0_1 = { 0 };
- v8u16 sad1_0 = { 0 };
- v8u16 sad1_1 = { 0 };
- v8u16 sad2_0 = { 0 };
- v8u16 sad2_1 = { 0 };
- v8u16 sad3_0 = { 0 };
- v8u16 sad3_1 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
-
- LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
- ref0_ptr += ref_stride;
- sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
- ref1_ptr += ref_stride;
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
- ref2_ptr += ref_stride;
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
- ref3_ptr += ref_stride;
- sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad_array[0] = HADD_UH_U32(sad0_0);
- sad_array[0] += HADD_UH_U32(sad0_1);
- sad_array[1] = HADD_UH_U32(sad1_0);
- sad_array[1] += HADD_UH_U32(sad1_1);
- sad_array[2] = HADD_UH_U32(sad2_0);
- sad_array[2] += HADD_UH_U32(sad2_1);
- sad_array[3] = HADD_UH_U32(sad3_0);
- sad_array[3] += HADD_UH_U32(sad3_1);
-}
-
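
The *_x4d helpers above evaluate four reference candidates against the same source block in one pass. A hedged scalar equivalent (hypothetical helper):

#include <stdint.h>
#include <stdlib.h>

/* One source block, four reference candidates, four SAD results. */
static void sad_x4d_ref(const uint8_t *src, int32_t src_stride,
                        const uint8_t *const refs[4], int32_t ref_stride,
                        int32_t width, int32_t height, uint32_t sads[4]) {
  for (int i = 0; i < 4; ++i) sads[i] = 0;
  for (int r = 0; r < height; ++r) {
    for (int i = 0; i < 4; ++i) {
      const uint8_t *ref = refs[i] + r * ref_stride;
      for (int c = 0; c < width; ++c) sads[i] += abs(src[c] - ref[c]);
    }
    src += src_stride;
  }
}
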
-static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 diff, pred, comp;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
- comp = __msa_aver_u_b(pred, ref);
- diff = __msa_asub_u_b(src, comp);
- sad += __msa_hadd_u_h(diff, diff);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 diff0, diff1, pred0, pred1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
- sad += SAD_UB2_UH(src0, src1, diff0, diff1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 3); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += (4 * 16);
- AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
- sad += SAD_UB2_UH(src0, src1, comp0, comp1);
- AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
- sad += SAD_UB2_UH(src2, src3, comp0, comp1);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += (4 * 16);
- AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
- sad += SAD_UB2_UH(src0, src1, comp0, comp1);
- AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
- sad += SAD_UB2_UH(src2, src3, comp0, comp1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
- v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 comp0, comp1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
- LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
- ref += (4 * ref_stride);
-
- LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
- LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
- sec_pred += (4 * 32);
-
- AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
- sad += SAD_UB2_UH(src0, src1, comp0, comp1);
- AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
- sad += SAD_UB2_UH(src2, src3, comp0, comp1);
- AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
- sad += SAD_UB2_UH(src4, src5, comp0, comp1);
- AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
- sad += SAD_UB2_UH(src6, src7, comp0, comp1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 comp0, comp1, comp2, comp3;
- v16u8 pred0, pred1, pred2, pred3;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v4u32 sad;
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
- }
-
- sad = __msa_hadd_u_w(sad0, sad0);
- sad += __msa_hadd_u_w(sad1, sad1);
-
- return HADD_SW_S32(sad);
-}
-
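
The avgsad_* helpers above first average the reference with a second predictor (with rounding, as __msa_aver_u_b does) and then take the SAD against the source. A scalar sketch, assuming second_pred is a contiguous width x height block as in the callers (hypothetical helper):

#include <stdint.h>
#include <stdlib.h>

/* Average ref with a second predictor (rounding up), then SAD against src. */
static uint32_t avgsad_ref(const uint8_t *src, int32_t src_stride,
                           const uint8_t *ref, int32_t ref_stride,
                           int32_t width, int32_t height,
                           const uint8_t *second_pred) {
  uint32_t sad = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int avg = (ref[c] + second_pred[c] + 1) >> 1;
      sad += abs(src[c] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;  /* second_pred is a contiguous width x height block */
  }
  return sad;
}
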
-#define AOM_SAD_4xHEIGHT_MSA(height) \
- uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_8xHEIGHT_MSA(height) \
- uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_16xHEIGHT_MSA(height) \
- uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_32xHEIGHT_MSA(height) \
- uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_64xHEIGHT_MSA(height) \
- uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_4xHEIGHTx4D_MSA(height) \
- void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[4], \
- int32_t ref_stride, uint32_t sads[4]) { \
- sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_8xHEIGHTx4D_MSA(height) \
- void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[4], \
- int32_t ref_stride, uint32_t sads[4]) { \
- sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_16xHEIGHTx4D_MSA(height) \
- void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[4], \
- int32_t ref_stride, uint32_t sads[4]) { \
- sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_32xHEIGHTx4D_MSA(height) \
- void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[4], \
- int32_t ref_stride, uint32_t sads[4]) { \
- sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_64xHEIGHTx4D_MSA(height) \
- void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[4], \
- int32_t ref_stride, uint32_t sads[4]) { \
- sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_AVGSAD_4xHEIGHT_MSA(height) \
- uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_8xHEIGHT_MSA(height) \
- uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_16xHEIGHT_MSA(height) \
- uint32_t aom_sad16x##height##_avg_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, const uint8_t *second_pred) { \
- return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_32xHEIGHT_MSA(height) \
- uint32_t aom_sad32x##height##_avg_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, const uint8_t *second_pred) { \
- return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_64xHEIGHT_MSA(height) \
- uint32_t aom_sad64x##height##_avg_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, const uint8_t *second_pred) { \
- return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-/* clang-format off */
-// 64x64
-AOM_SAD_64xHEIGHT_MSA(64)
-AOM_SAD_64xHEIGHTx4D_MSA(64)
-AOM_AVGSAD_64xHEIGHT_MSA(64)
-
-// 64x32
-AOM_SAD_64xHEIGHT_MSA(32)
-AOM_SAD_64xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_64xHEIGHT_MSA(32)
-
-// 32x64
-AOM_SAD_32xHEIGHT_MSA(64)
-AOM_SAD_32xHEIGHTx4D_MSA(64)
-AOM_AVGSAD_32xHEIGHT_MSA(64)
-
-// 32x32
-AOM_SAD_32xHEIGHT_MSA(32)
-AOM_SAD_32xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_32xHEIGHT_MSA(32)
-
-// 32x16
-AOM_SAD_32xHEIGHT_MSA(16)
-AOM_SAD_32xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_32xHEIGHT_MSA(16)
-
-// 16x32
-AOM_SAD_16xHEIGHT_MSA(32)
-AOM_SAD_16xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_16xHEIGHT_MSA(32)
-
-// 16x16
-AOM_SAD_16xHEIGHT_MSA(16)
-AOM_SAD_16xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_16xHEIGHT_MSA(16)
-
-// 16x8
-AOM_SAD_16xHEIGHT_MSA(8)
-AOM_SAD_16xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_16xHEIGHT_MSA(8)
-
-// 8x16
-AOM_SAD_8xHEIGHT_MSA(16)
-AOM_SAD_8xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_8xHEIGHT_MSA(16)
-
-// 8x8
-AOM_SAD_8xHEIGHT_MSA(8)
-AOM_SAD_8xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_8xHEIGHT_MSA(8)
-
-// 8x4
-AOM_SAD_8xHEIGHT_MSA(4)
-AOM_SAD_8xHEIGHTx4D_MSA(4)
-AOM_AVGSAD_8xHEIGHT_MSA(4)
-
-// 4x8
-AOM_SAD_4xHEIGHT_MSA(8)
-AOM_SAD_4xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_4xHEIGHT_MSA(8)
-
-// 4x4
-AOM_SAD_4xHEIGHT_MSA(4)
-AOM_SAD_4xHEIGHTx4D_MSA(4)
-AOM_AVGSAD_4xHEIGHT_MSA(4)
- /* clang-format on */
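
For reference, each AOM_*_MSA(height) instantiation above just stamps out a thin exported wrapper around the static helpers; for example, AOM_SAD_16xHEIGHT_MSA(16) expands to roughly:

uint32_t aom_sad16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride) {
  return sad_16width_msa(src, src_stride, ref, ref_stride, 16);
}
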
diff --git a/aom_dsp/mips/sub_pixel_variance_msa.c b/aom_dsp/mips/sub_pixel_variance_msa.c
deleted file mode 100644
index 810b6efaa..000000000
--- a/aom_dsp/mips/sub_pixel_variance_msa.c
+++ /dev/null
@@ -1,1792 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/variance.h"
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) \
- { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
- }
-
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
- sse - (((int64_t)diff * diff) >> shift)
-
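
CALC_MSE_AVG_B and the VARIANCE_* macros above implement the usual one-pass variance computation: accumulate the sum of squared differences (sse) and the signed sum of differences (sum), then apply var = sse - sum^2 / (width * height), where 'shift' is log2(width * height). A scalar sketch of both steps (hypothetical helpers):

#include <stdint.h>

/* Accumulate SSE and the signed sum of differences for n pixel pairs. */
static void accumulate_sse_sum(const uint8_t *src, const uint8_t *ref,
                               int n, int64_t *sse, int32_t *sum) {
  for (int i = 0; i < n; ++i) {
    const int d = src[i] - ref[i];
    *sse += d * d;
    *sum += d;
  }
}

/* var = sse - sum^2 / (2^log2_count), mirroring VARIANCE_LARGE_WxH. */
static int64_t variance_from_sse_sum(int64_t sse, int64_t sum,
                                     int log2_count) {
  return sse - ((sum * sum) >> log2_count);
}
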
-static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 pred, src = { 0 };
- v16u8 ref = { 0 };
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src, ref, pred;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1, pred0, pred1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1, pred0, pred1;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
-
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v8i16 avg2 = { 0 };
- v8i16 avg3 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 32; ht_cnt--;) {
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
-
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- vec += __msa_hadd_s_w(avg2, avg2);
- vec += __msa_hadd_s_w(avg3, avg3);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
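
The avg_sse_diff_* helpers above combine the compound-prediction average with the SSE/sum accumulation: the src block is averaged (with rounding) against a second predictor and the result is then compared against ref. A hedged scalar equivalent (hypothetical helper, second_pred assumed contiguous):

#include <stdint.h>

/* Average src with a second predictor (rounding up), then accumulate the
   SSE and the signed sum of differences against ref. */
static uint32_t avg_sse_diff_ref(const uint8_t *src, int32_t src_stride,
                                 const uint8_t *ref, int32_t ref_stride,
                                 const uint8_t *second_pred, int32_t width,
                                 int32_t height, int32_t *sum) {
  uint32_t sse = 0;
  *sum = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int p = (src[c] + second_pred[c] + 1) >> 1;
      const int d = p - ref[c];
      sse += (uint32_t)(d * d);
      *sum += d;
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;
  }
  return sse;
}
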
-static uint32_t sub_pixel_sse_diff_4width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- CALC_MSE_AVG_B(src0, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 filt0, out, ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, dst0, var, avg);
- CALC_MSE_AVG_B(src1, dst1, var, avg);
- CALC_MSE_AVG_B(src2, dst2, var, avg);
- CALC_MSE_AVG_B(src3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4, out;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 ref = { 0 };
- v16u8 src2110, src4332;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
- vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out, ref = { 0 };
- v16u8 filt_vt, filt_hz, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt_vt, filt_hz, vec0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- CALC_MSE_AVG_B(src2, ref2, var, avg);
- CALC_MSE_AVG_B(src3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 out, pred, filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 out, pred, filt0;
- v16u8 ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff, int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v16u8 pred0, pred1, pred2, pred3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
- tmp2, tmp3);
- AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
- tmp2, tmp3);
-
- CALC_MSE_AVG_B(tmp0, dst0, var, avg);
- CALC_MSE_AVG_B(tmp1, dst1, var, avg);
- CALC_MSE_AVG_B(tmp2, dst2, var, avg);
- CALC_MSE_AVG_B(tmp3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 out, pred, ref = { 0 };
- v16u8 src2110, src4332, filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, filt0;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
- vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff, int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
- out2, out3);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 out, pred, ref = { 0 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 pred0, pred1, out0, out1;
- v16u8 filt_hz, filt_vt, vec0;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v16u8 out0, out1, out2, out3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
- out2, out3);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
- uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \
- const uint8_t *src, int32_t src_stride, int32_t xoffset, \
- int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sse) { \
- int32_t diff; \
- uint32_t var; \
- const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
- src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
- src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
- } \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
- src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
- sse); \
- } \
- } \
- \
- return var; \
- }
-
-/* clang-format off */
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
-/* clang-format on */
-
-#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
- uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \
- const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
- int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
- uint32_t *sse, const uint8_t *sec_pred) { \
- int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
- &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
- &diff); \
- } else { \
- *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
- ref_stride, sec_pred, ht, &diff); \
- } \
- } \
- \
- return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- }
-
-/* clang-format off */
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
-/* clang-format on */
-
-uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- int32_t xoffset, int32_t yoffset,
- const uint8_t *ref_ptr,
- int32_t ref_stride, uint32_t *sse,
- const uint8_t *sec_pred) {
- int32_t diff;
- const uint8_t *h_filter = bilinear_filters_2t[xoffset];
- const uint8_t *v_filter = bilinear_filters_2t[yoffset];
-
- if (yoffset) {
- if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
- v_filter, 64, &diff);
- } else {
- *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
- ref_stride, sec_pred,
- v_filter, 64, &diff);
- }
- } else {
- if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
- ref_stride, sec_pred,
- h_filter, 64, &diff);
- } else {
- *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
- sec_pred, &diff);
- }
- }
-
- return VARIANCE_32Wx64H(*sse, diff);
-}
-
-#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
- uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \
- const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
- int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
- uint32_t *sse, const uint8_t *sec_pred) { \
- int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
- &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
- &diff); \
- } else { \
- *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
- ref_stride, sec_pred, &diff); \
- } \
- } \
- \
- return VARIANCE_64Wx##ht##H(*sse, diff); \
- }
-
-/* clang-format off */
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
-/* clang-format on */
diff --git a/aom_dsp/mips/subtract_msa.c b/aom_dsp/mips/subtract_msa.c
deleted file mode 100644
index bfed773ac..000000000
--- a/aom_dsp/mips/subtract_msa.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *pred_ptr, int32_t pred_stride,
- int16_t *diff_ptr, int32_t diff_stride) {
- uint32_t src0, src1, src2, src3;
- uint32_t pred0, pred1, pred2, pred3;
- v16i8 src = { 0 };
- v16i8 pred = { 0 };
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
- INSERT_W4_SB(src0, src1, src2, src3, src);
- INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
- ILVRL_B2_UB(src, pred, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
-}
-
-static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *pred_ptr, int32_t pred_stride,
- int16_t *diff_ptr, int32_t diff_stride) {
- uint32_t loop_cnt;
- uint64_t src0, src1, pred0, pred1;
- v16i8 src = { 0 };
- v16i8 pred = { 0 };
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 4; loop_cnt--;) {
- LD2(src_ptr, src_stride, src0, src1);
- src_ptr += (2 * src_stride);
- LD2(pred_ptr, pred_stride, pred0, pred1);
- pred_ptr += (2 * pred_stride);
-
- INSERT_D2_SB(src0, src1, src);
- INSERT_D2_SB(pred0, pred1, pred);
- ILVRL_B2_UB(src, pred, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff_ptr, diff_stride);
- diff_ptr += (2 * diff_stride);
- }
-}
-
-static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- int8_t count;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (count = 2; count--;) {
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
- pred7);
- pred += (8 * pred_stride);
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
- }
-}
-
-static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 8; loop_cnt--;) {
- LD_SB2(src, 16, src0, src1);
- src += src_stride;
- LD_SB2(src, 16, src2, src3);
- src += src_stride;
- LD_SB2(src, 16, src4, src5);
- src += src_stride;
- LD_SB2(src, 16, src6, src7);
- src += src_stride;
-
- LD_SB2(pred, 16, pred0, pred1);
- pred += pred_stride;
- LD_SB2(pred, 16, pred2, pred3);
- pred += pred_stride;
- LD_SB2(pred, 16, pred4, pred5);
- pred += pred_stride;
- LD_SB2(pred, 16, pred6, pred7);
- pred += pred_stride;
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
- }
-}
-
-static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 32; loop_cnt--;) {
- LD_SB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_SB4(src, 16, src4, src5, src6, src7);
- src += src_stride;
-
- LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
- pred += pred_stride;
- LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
- pred += pred_stride;
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 32, 8);
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 48, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 32, 8);
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 48, 8);
- diff += diff_stride;
- }
-}
-
-void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
- ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr,
- ptrdiff_t pred_stride) {
- if (rows == cols) {
- switch (rows) {
- case 4:
- sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 8:
- sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 16:
- sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 32:
- sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 64:
- sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- default:
- aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- }
- } else {
- aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
- pred_ptr, pred_stride);
- }
-}
diff --git a/aom_dsp/mips/variance_msa.c b/aom_dsp/mips/variance_msa.c
deleted file mode 100644
index 065c09ac5..000000000
--- a/aom_dsp/mips/variance_msa.c
+++ /dev/null
@@ -1,633 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define CALC_MSE_B(src, ref, var) \
- { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- }
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) \
- { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
- }
-
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
- sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- int32_t ht_cnt;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src, ref;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v8i16 avg2 = { 0 };
- v8i16 avg3 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 32; ht_cnt--;) {
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
-
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- vec += __msa_hadd_s_w(avg2, avg2);
- vec += __msa_hadd_s_w(avg3, avg3);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t get_mb_ss_msa(const int16_t *src) {
- uint32_t sum, cnt;
- v8i16 src0, src1, src2, src3;
- v4i32 src0_l, src1_l, src2_l, src3_l;
- v4i32 src0_r, src1_r, src2_r, src3_r;
- v2i64 sq_src_l = { 0 };
- v2i64 sq_src_r = { 0 };
-
- for (cnt = 8; cnt--;) {
- LD_SH4(src, 8, src0, src1, src2, src3);
- src += 4 * 8;
-
- UNPCK_SH_SW(src0, src0_l, src0_r);
- UNPCK_SH_SW(src1, src1_l, src1_r);
- UNPCK_SH_SW(src2, src2_l, src2_r);
- UNPCK_SH_SW(src3, src3_l, src3_r);
-
- DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
- DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
- DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
- DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
- }
-
- sq_src_l += __msa_splati_d(sq_src_l, 1);
- sq_src_r += __msa_splati_d(sq_src_r, 1);
-
- sum = __msa_copy_s_d(sq_src_l, 0);
- sum += __msa_copy_s_d(sq_src_r, 0);
-
- return sum;
-}
-
-static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- CALC_MSE_B(src, ref, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src, ref;
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v4i32 var = { 0 };
-
- for (ht_cnt = height >> 1; ht_cnt--;) {
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src2, ref2, var);
- CALC_MSE_B(src1, ref1, var);
- CALC_MSE_B(src3, ref3, var);
-
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src2, ref2, var);
- CALC_MSE_B(src1, ref1, var);
- CALC_MSE_B(src3, ref3, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride) {
- uint32_t err = 0;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16i8 src = { 0 };
- v16i8 ref = { 0 };
- v16u8 src_vec0, src_vec1;
- v8i16 diff0, diff1;
- v4i32 err0 = { 0 };
- v4i32 err1 = { 0 };
-
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_SB(src0, src1, src2, src3, src);
- INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
- ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
- HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
- DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
- err = HADD_SW_S32(err0);
- err += HADD_SW_S32(err1);
-
- return err;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \
- uint32_t aom_variance##wd##x##ht##_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, uint32_t *sse) { \
- int32_t diff; \
- \
- *sse = \
- sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
- \
- return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- }
-
-/* clang-format off */
-AOM_VARIANCE_WDXHT_MSA(4, 4)
-AOM_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_VARIANCE_WDXHT_MSA(8, 4)
-AOM_VARIANCE_WDXHT_MSA(8, 8)
-AOM_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_VARIANCE_WDXHT_MSA(16, 8)
-AOM_VARIANCE_WDXHT_MSA(16, 16)
-AOM_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_VARIANCE_WDXHT_MSA(32, 16)
-AOM_VARIANCE_WDXHT_MSA(32, 32)
-/* clang-format on */
-
-uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- int32_t diff;
-
- *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
-
- return VARIANCE_32Wx64H(*sse, diff);
-}
-
-uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- int32_t diff;
-
- *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
-
- return VARIANCE_64Wx32H(*sse, diff);
-}
-
-uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- int32_t diff;
-
- *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
-
- return VARIANCE_64Wx64H(*sse, diff);
-}
-
-uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
- *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
-
- return *sse;
-}
-
-uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
-
- return *sse;
-}
-
-uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
-
- return *sse;
-}
-
-uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
-
- return *sse;
-}
-
-void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
- int32_t *sum) {
- *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
-}
-
-void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
- int32_t *sum) {
- *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
-}
-
-uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
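Note on the MSA kernels removed above: every width/height variant reduces to the same scalar identity, variance = SSE - sum^2 / N with N = w * h, which is what VARIANCE_WxH(sse, diff, shift) encodes with shift = log2(w * h). A minimal scalar sketch of that identity (hypothetical helper, not libaom code):

/* Scalar reference for the identity behind VARIANCE_WxH / VARIANCE_LARGE_WxH. */
static unsigned int variance_wxh_ref(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int w, int h) {
  int64_t sse = 0, sum = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int d = src[j] - ref[j];
      sum += d;
      sse += d * d;
    }
    src += src_stride;
    ref += ref_stride;
  }
  /* The macros use a shift of log2(w * h): 4 for 4x4, 6 for 8x8, 12 for 64x64. */
  return (unsigned int)(sse - (sum * sum) / (w * h));
}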
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c
index 70fdb7bca..ed4508739 100644
--- a/aom_dsp/noise_model.c
+++ b/aom_dsp/noise_model.c
@@ -610,8 +610,9 @@ int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
Gxy += gx * gy;
Gyy += gy * gy;
- mean += block[yi * block_size + xi];
- var += block[yi * block_size + xi] * block[yi * block_size + xi];
+ const double value = block[yi * block_size + xi];
+ mean += value;
+ var += value * value;
}
}
mean /= (block_size - 2) * (block_size - 2);
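The hunk above only hoists the pixel into a const local; the accumulators are then turned into a mean and a variance with the usual E[x^2] - (E[x])^2 step, as the surrounding division by (block_size - 2)^2 suggests. A tiny illustration of that finishing step (hypothetical helper, shown only for clarity):

/* Finish running sum / sum-of-squares accumulators into a variance. */
static double finish_variance(double sum, double sum_sq, int n) {
  const double mean = sum / n;
  return sum_sq / n - mean * mean; /* E[x^2] - (E[x])^2 */
}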
diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c
index d846a102e..08fb69c82 100644
--- a/aom_dsp/psnr.c
+++ b/aom_dsp/psnr.c
@@ -26,59 +26,41 @@ double aom_sse_to_psnr(double samples, double peak, double sse) {
}
}
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, unsigned int *sse,
- int *sum) {
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
int i, j;
-
- *sum = 0;
- *sse = 0;
+ int64_t sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
+ sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
+ return sse;
}
#if CONFIG_AV1_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, uint64_t *sse, int64_t *sum) {
+static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h) {
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- int64_t tsum = 0;
- uint64_t tsse = 0;
+ int64_t sse = 0;
for (int i = 0; i < h; ++i) {
- int32_t lsum = 0;
for (int j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
- lsum += diff;
- tsse += (uint32_t)(diff * diff);
+ sse += diff * diff;
}
- tsum += lsum;
a += a_stride;
b += b_stride;
}
- *sum = tsum;
- *sse = tsse;
+ return sse;
}
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
- &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
-}
#endif // CONFIG_AV1_HIGHBITDEPTH
static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
@@ -86,26 +68,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
const int dw = width % 16;
const int dh = height % 16;
int64_t total_sse = 0;
- unsigned int sse = 0;
- int sum = 0;
int x, y;
if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
- height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height);
}
if (dh > 0) {
- encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride, width - dw, dh,
- &sse, &sum);
- total_sse += sse;
+ total_sse +=
+ encoder_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
aom_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
@@ -147,22 +126,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int x, y;
const int dw = width % 16;
const int dh = height % 16;
- unsigned int sse = 0;
- int sum = 0;
+
if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
- b_stride, dw, height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height);
}
if (dh > 0) {
- encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh);
}
+
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
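For context, the SSE totals produced by encoder_sse() / encoder_highbd_8_sse() above are what aom_sse_to_psnr() (declared at the top of this file) converts to decibels. The conversion is roughly the sketch below; the zero-SSE guard and its cap are assumptions, not copied from the library:

#include <math.h>

/* Assumed shape of the SSE -> PSNR conversion: samples = pixel count,
 * peak = 255 for 8-bit input. */
static double sse_to_psnr_sketch(double samples, double peak, double sse) {
  if (sse <= 0.0) return 100.0; /* assumed practical upper bound */
  return 10.0 * log10(samples * peak * peak / sse);
}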
diff --git a/aom_dsp/rect.h b/aom_dsp/rect.h
new file mode 100644
index 000000000..11bdaca97
--- /dev/null
+++ b/aom_dsp/rect.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_RECT_H_
+#define AOM_AOM_DSP_RECT_H_
+
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+
+// Struct representing a rectangle of pixels.
+// The axes are inclusive-exclusive, ie. the point (top, left) is included
+// in the rectangle but (bottom, right) is not.
+typedef struct {
+ int left, right, top, bottom;
+} PixelRect;
+
+static INLINE int rect_width(const PixelRect *r) { return r->right - r->left; }
+
+static INLINE int rect_height(const PixelRect *r) { return r->bottom - r->top; }
+
+static INLINE bool is_inside_rect(const int x, const int y,
+ const PixelRect *r) {
+ return (r->left <= x && x < r->right) && (r->top <= y && y < r->bottom);
+}
+
+#endif // AOM_AOM_DSP_RECT_H_
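A short usage sketch for the new header (values are illustrative only): with the half-open convention documented above, a rectangle covering columns 0..15 and rows 0..7 is written with left = 0, right = 16, top = 0, bottom = 8.

/* Illustration of the half-open PixelRect convention from aom_dsp/rect.h. */
PixelRect r = { /*left=*/0, /*right=*/16, /*top=*/0, /*bottom=*/8 };
int w = rect_width(&r);               /* 16 */
int h = rect_height(&r);              /* 8 */
int last = is_inside_rect(15, 7, &r); /* 1: last included pixel */
int edge = is_inside_rect(16, 8, &r); /* 0: (right, bottom) is excluded */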
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index 94260ce11..5b7b0e4d0 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -120,70 +120,93 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
} \
}
#endif // CONFIG_REALTIME_ONLY
+// Call SIMD version of aom_sad_mxnx4d if the 3d version is unavailable.
+#define SAD_MXNX3D(m, n) \
+ void aom_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, sad_array); \
+ }
// 128x128
SADMXN(128, 128)
SAD_MXNX4D(128, 128)
+SAD_MXNX3D(128, 128)
// 128x64
SADMXN(128, 64)
SAD_MXNX4D(128, 64)
+SAD_MXNX3D(128, 64)
// 64x128
SADMXN(64, 128)
SAD_MXNX4D(64, 128)
+SAD_MXNX3D(64, 128)
// 64x64
SADMXN(64, 64)
SAD_MXNX4D(64, 64)
+SAD_MXNX3D(64, 64)
// 64x32
SADMXN(64, 32)
SAD_MXNX4D(64, 32)
+SAD_MXNX3D(64, 32)
// 32x64
SADMXN(32, 64)
SAD_MXNX4D(32, 64)
+SAD_MXNX3D(32, 64)
// 32x32
SADMXN(32, 32)
SAD_MXNX4D(32, 32)
+SAD_MXNX3D(32, 32)
// 32x16
SADMXN(32, 16)
SAD_MXNX4D(32, 16)
+SAD_MXNX3D(32, 16)
// 16x32
SADMXN(16, 32)
SAD_MXNX4D(16, 32)
+SAD_MXNX3D(16, 32)
// 16x16
SADMXN(16, 16)
SAD_MXNX4D(16, 16)
+SAD_MXNX3D(16, 16)
// 16x8
SADMXN(16, 8)
SAD_MXNX4D(16, 8)
+SAD_MXNX3D(16, 8)
// 8x16
SADMXN(8, 16)
SAD_MXNX4D(8, 16)
+SAD_MXNX3D(8, 16)
// 8x8
SADMXN(8, 8)
SAD_MXNX4D(8, 8)
+SAD_MXNX3D(8, 8)
// 8x4
SADMXN(8, 4)
SAD_MXNX4D(8, 4)
+SAD_MXNX3D(8, 4)
// 4x8
SADMXN(4, 8)
SAD_MXNX4D(4, 8)
+SAD_MXNX3D(4, 8)
// 4x4
SADMXN(4, 4)
SAD_MXNX4D(4, 4)
+SAD_MXNX3D(4, 4)
SAD_MXH(128)
SAD_MXH(64)
@@ -204,6 +227,14 @@ SADMXN(16, 64)
SAD_MXNX4D(16, 64)
SADMXN(64, 16)
SAD_MXNX4D(64, 16)
+#if !CONFIG_REALTIME_ONLY
+SAD_MXNX3D(4, 16)
+SAD_MXNX3D(16, 4)
+SAD_MXNX3D(8, 32)
+SAD_MXNX3D(32, 8)
+SAD_MXNX3D(16, 64)
+SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
#if CONFIG_AV1_HIGHBITDEPTH
static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
@@ -291,70 +322,94 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
2 * ref_stride, (m), (n / 2)); \
} \
}
+// Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable.
+#define HIGHBD_SAD_MXNX3D(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
// 128x128
HIGHBD_SADMXN(128, 128)
HIGHBD_SAD_MXNX4D(128, 128)
+HIGHBD_SAD_MXNX3D(128, 128)
// 128x64
HIGHBD_SADMXN(128, 64)
HIGHBD_SAD_MXNX4D(128, 64)
+HIGHBD_SAD_MXNX3D(128, 64)
// 64x128
HIGHBD_SADMXN(64, 128)
HIGHBD_SAD_MXNX4D(64, 128)
+HIGHBD_SAD_MXNX3D(64, 128)
// 64x64
HIGHBD_SADMXN(64, 64)
HIGHBD_SAD_MXNX4D(64, 64)
+HIGHBD_SAD_MXNX3D(64, 64)
// 64x32
HIGHBD_SADMXN(64, 32)
HIGHBD_SAD_MXNX4D(64, 32)
+HIGHBD_SAD_MXNX3D(64, 32)
// 32x64
HIGHBD_SADMXN(32, 64)
HIGHBD_SAD_MXNX4D(32, 64)
+HIGHBD_SAD_MXNX3D(32, 64)
// 32x32
HIGHBD_SADMXN(32, 32)
HIGHBD_SAD_MXNX4D(32, 32)
+HIGHBD_SAD_MXNX3D(32, 32)
// 32x16
HIGHBD_SADMXN(32, 16)
HIGHBD_SAD_MXNX4D(32, 16)
+HIGHBD_SAD_MXNX3D(32, 16)
// 16x32
HIGHBD_SADMXN(16, 32)
HIGHBD_SAD_MXNX4D(16, 32)
+HIGHBD_SAD_MXNX3D(16, 32)
// 16x16
HIGHBD_SADMXN(16, 16)
HIGHBD_SAD_MXNX4D(16, 16)
+HIGHBD_SAD_MXNX3D(16, 16)
// 16x8
HIGHBD_SADMXN(16, 8)
HIGHBD_SAD_MXNX4D(16, 8)
+HIGHBD_SAD_MXNX3D(16, 8)
// 8x16
HIGHBD_SADMXN(8, 16)
HIGHBD_SAD_MXNX4D(8, 16)
+HIGHBD_SAD_MXNX3D(8, 16)
// 8x8
HIGHBD_SADMXN(8, 8)
HIGHBD_SAD_MXNX4D(8, 8)
+HIGHBD_SAD_MXNX3D(8, 8)
// 8x4
HIGHBD_SADMXN(8, 4)
HIGHBD_SAD_MXNX4D(8, 4)
+HIGHBD_SAD_MXNX3D(8, 4)
// 4x8
HIGHBD_SADMXN(4, 8)
HIGHBD_SAD_MXNX4D(4, 8)
+HIGHBD_SAD_MXNX3D(4, 8)
// 4x4
HIGHBD_SADMXN(4, 4)
HIGHBD_SAD_MXNX4D(4, 4)
+HIGHBD_SAD_MXNX3D(4, 4)
HIGHBD_SADMXN(4, 16)
HIGHBD_SAD_MXNX4D(4, 16)
@@ -368,4 +423,13 @@ HIGHBD_SADMXN(16, 64)
HIGHBD_SAD_MXNX4D(16, 64)
HIGHBD_SADMXN(64, 16)
HIGHBD_SAD_MXNX4D(64, 16)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_SAD_MXNX3D(4, 16)
+HIGHBD_SAD_MXNX3D(16, 4)
+HIGHBD_SAD_MXNX3D(8, 32)
+HIGHBD_SAD_MXNX3D(32, 8)
+HIGHBD_SAD_MXNX3D(16, 64)
+HIGHBD_SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
#endif // CONFIG_AV1_HIGHBITDEPTH
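For clarity, SAD_MXNX3D(m, n) and HIGHBD_SAD_MXNX3D(m, n) above expand to thin forwarders: the new x3d entry points call the (possibly SIMD-accelerated) x4d kernels and let the caller ignore the fourth result. Reconstructed expansion for SAD_MXNX3D(16, 16), derived from the macro rather than copied from generated code:

void aom_sad16x16x3d_c(const uint8_t *src, int src_stride,
                       const uint8_t *const ref_array[4], int ref_stride,
                       uint32_t sad_array[4]) {
  /* Reuses the 4-candidate SAD; only the first three results are needed. */
  aom_sad16x16x4d(src, src_stride, ref_array, ref_stride, sad_array);
}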
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index c404015ef..32b51c956 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -36,7 +36,7 @@ SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
}
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- return _mm_set_epi32(a, b, c, d);
+ return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
}
SIMD_INLINE v128 v128_load_aligned(const void *p) {
@@ -81,16 +81,16 @@ SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }
-SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
-SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
-SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
SIMD_INLINE v128 v128_dup_64(uint64_t x) {
// _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
- return _mm_set_epi32((uint32_t)(x >> 32), (uint32_t)x, (uint32_t)(x >> 32),
- (uint32_t)x);
+ return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32),
+ (int32_t)x);
}
SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
@@ -304,7 +304,7 @@ SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
v128 output;
unsigned char *input = (unsigned char *)&x;
unsigned char *index = (unsigned char *)&pattern;
- char *selected = (char *)&output;
+ unsigned char *selected = (unsigned char *)&output;
int counter;
for (counter = 0; counter < 16; counter++) {
@@ -534,58 +534,58 @@ SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
- return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
- _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+ return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
- _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c + 8);
+ __m128i x = _mm_cvtsi32_si128((int)(c + 8));
return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
_mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
}
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
- return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
- return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
- return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
- return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
- return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
- return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
- return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
- return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
// _mm_sra_epi64 is missing in gcc?
- return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
- (int64_t)v64_u64(v128_low_v64(a)) >> c);
- // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
+ return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c),
+ (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c));
+ // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c));
}
/* These intrinsics require immediate values, so we must use #defines
@@ -593,9 +593,9 @@ SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
#define v128_shl_n_8(a, c) \
- _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+ _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
#define v128_shr_n_u8(a, c) \
- _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+ _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
#define v128_shr_n_s8(a, c) \
_mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
_mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
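The casts added throughout this header follow one pattern: the SSE2 intrinsics take signed scalar arguments (char for _mm_set1_epi8, short for _mm_set1_epi16, int for _mm_set_epi32 and _mm_cvtsi32_si128), so unsigned inputs are now converted explicitly instead of implicitly. A minimal illustration of the two recurring cases:

#include <emmintrin.h>
#include <stdint.h>

static __m128i dup_byte(uint8_t x) {
  /* Same bit pattern; the explicit (char) cast avoids implicit-conversion
   * warnings on compilers that flag uint8_t -> char narrowing. */
  return _mm_set1_epi8((char)x);
}

static __m128i shl_16(__m128i a, unsigned int c) {
  /* _mm_cvtsi32_si128 takes int, so the unsigned shift count is cast. */
  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}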
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
index 47003af31..66cfda31e 100644
--- a/aom_dsp/simd/v256_intrinsics_c.h
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -380,7 +380,7 @@ SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
}
SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
- return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
+ return ((uint32_t)(a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
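The single cast in this hunk is a correctness fix rather than a style change: (a.s8[31] < 0) has type int, and shifting a nonzero int left by 31 places overflows into the sign bit, which is undefined behaviour; the remaining terms shift by at most 30 and stay in range. Sketch of the safe pattern for the top bit:

/* Building bit 31 of a movemask without signed-overflow UB. */
static uint32_t top_mask_bit(int lane_is_negative) {
  return (uint32_t)(lane_is_negative != 0) << 31;
}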
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index eb5eaf063..894ddee16 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -57,7 +57,7 @@ SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
}
SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
- return _mm256_set_epi64x(a, b, c, d);
+ return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d);
}
SIMD_INLINE v256 v256_load_aligned(const void *p) {
@@ -78,13 +78,15 @@ SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
-SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); }
-SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); }
-SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32((int)x); }
-SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+ return _mm256_set1_epi64x((int64_t)x);
+}
SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
@@ -543,7 +545,9 @@ SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
-SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+ return (uint32_t)_mm256_movemask_epi8(a);
+}
SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
return _mm256_blendv_epi8(a, b, c);
@@ -596,56 +600,56 @@ SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
}
SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
- return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
- _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
+ return _mm256_and_si256(_mm256_set1_epi8((char)(0xff << c)),
+ _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
- _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c + 8);
+ __m128i x = _mm_cvtsi32_si128((int)(c + 8));
return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
_mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
}
SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
- return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
- return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
- return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
- return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
- return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
- return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
- return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
- return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
#if defined(__AVX512VL__)
- return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c));
#else
return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
v128_shr_s64(v256_low_v128(a), c));
@@ -677,11 +681,12 @@ SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
#define v256_align(a, b, c) \
((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
-#define v256_shl_n_8(a, c) \
- _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
+#define v256_shl_n_8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))), \
_mm256_slli_epi16(a, c))
-#define v256_shr_n_u8(a, c) \
- _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
+#define v256_shr_n_u8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \
+ _mm256_srli_epi16(a, c))
#define v256_shr_n_s8(a, c) \
_mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
_mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
diff --git a/aom_dsp/simd/v64_intrinsics_c.h b/aom_dsp/simd/v64_intrinsics_c.h
index b84f243c4..bfd6fe071 100644
--- a/aom_dsp/simd/v64_intrinsics_c.h
+++ b/aom_dsp/simd/v64_intrinsics_c.h
@@ -186,11 +186,7 @@ SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++)
- t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255
- ? 255
- : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0
- ? 0
- : (int16_t)a.u8[c] + (int16_t)b.u8[c];
+ t.u8[c] = SIMD_CLAMP((int16_t)a.u8[c] + (int16_t)b.u8[c], 0, 255);
return t;
}
@@ -198,11 +194,7 @@ SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++)
- t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127
- ? 127
- : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128
- ? -128
- : (int16_t)a.s8[c] + (int16_t)b.s8[c];
+ t.s8[c] = SIMD_CLAMP((int16_t)a.s8[c] + (int16_t)b.s8[c], -128, 127);
return t;
}
@@ -210,11 +202,7 @@ SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++)
- t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
- ? 32767
- : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
- ? -32768
- : (int32_t)a.s16[c] + (int32_t)b.s16[c];
+ t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] + (int32_t)b.s16[c], -32768, 32767);
return t;
}
@@ -244,7 +232,7 @@ SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
int c;
for (c = 0; c < 8; c++) {
int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
- t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
+ t.s8[c] = SIMD_CLAMP(d, -128, 127);
}
return t;
}
@@ -260,11 +248,7 @@ SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++)
- t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
- ? -32768
- : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
- ? 32767
- : (int32_t)a.s16[c] - (int32_t)b.s16[c];
+ t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] - (int32_t)b.s16[c], -32768, 32767);
return t;
}
@@ -481,10 +465,10 @@ SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
a = b;
b = u;
}
- t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
- t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
- t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
- t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
+ t.s16[3] = SIMD_CLAMP(a.s32[1], -32768, 32767);
+ t.s16[2] = SIMD_CLAMP(a.s32[0], -32768, 32767);
+ t.s16[1] = SIMD_CLAMP(b.s32[1], -32768, 32767);
+ t.s16[0] = SIMD_CLAMP(b.s32[0], -32768, 32767);
return t;
}
@@ -495,10 +479,10 @@ SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
a = b;
b = u;
}
- t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1];
- t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0];
- t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1];
- t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0];
+ t.u16[3] = SIMD_CLAMP(a.s32[1], 0, 65535);
+ t.u16[2] = SIMD_CLAMP(a.s32[0], 0, 65535);
+ t.u16[1] = SIMD_CLAMP(b.s32[1], 0, 65535);
+ t.u16[0] = SIMD_CLAMP(b.s32[0], 0, 65535);
return t;
}
@@ -509,14 +493,14 @@ SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
a = b;
b = u;
}
- t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
- t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
- t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
- t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
- t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
- t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
- t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
- t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
+ t.u8[7] = SIMD_CLAMP(a.s16[3], 0, 255);
+ t.u8[6] = SIMD_CLAMP(a.s16[2], 0, 255);
+ t.u8[5] = SIMD_CLAMP(a.s16[1], 0, 255);
+ t.u8[4] = SIMD_CLAMP(a.s16[0], 0, 255);
+ t.u8[3] = SIMD_CLAMP(b.s16[3], 0, 255);
+ t.u8[2] = SIMD_CLAMP(b.s16[2], 0, 255);
+ t.u8[1] = SIMD_CLAMP(b.s16[1], 0, 255);
+ t.u8[0] = SIMD_CLAMP(b.s16[0], 0, 255);
return t;
}
@@ -527,14 +511,14 @@ SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
a = b;
b = u;
}
- t.u8[7] = (uint8_t)(a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3]);
- t.u8[6] = (uint8_t)(a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2]);
- t.u8[5] = (uint8_t)(a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1]);
- t.u8[4] = (uint8_t)(a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0]);
- t.u8[3] = (uint8_t)(b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3]);
- t.u8[2] = (uint8_t)(b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2]);
- t.u8[1] = (uint8_t)(b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1]);
- t.u8[0] = (uint8_t)(b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0]);
+ t.s8[7] = SIMD_CLAMP(a.s16[3], -128, 127);
+ t.s8[6] = SIMD_CLAMP(a.s16[2], -128, 127);
+ t.s8[5] = SIMD_CLAMP(a.s16[1], -128, 127);
+ t.s8[4] = SIMD_CLAMP(a.s16[0], -128, 127);
+ t.s8[3] = SIMD_CLAMP(b.s16[3], -128, 127);
+ t.s8[2] = SIMD_CLAMP(b.s16[2], -128, 127);
+ t.s8[1] = SIMD_CLAMP(b.s16[1], -128, 127);
+ t.s8[0] = SIMD_CLAMP(b.s16[0], -128, 127);
return t;
}
@@ -702,13 +686,13 @@ SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
c_v64 t;
int32_t u;
u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
- t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ t.s16[0] = SIMD_CLAMP(u, -32768, 32767);
u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
- t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ t.s16[1] = SIMD_CLAMP(u, -32768, 32767);
u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
- t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ t.s16[2] = SIMD_CLAMP(u, -32768, 32767);
u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
- t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ t.s16[3] = SIMD_CLAMP(u, -32768, 32767);
return t;
}
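SIMD_CLAMP is not defined in this hunk; presumably it is introduced elsewhere in this change as a conventional clamp macro, along the lines of the sketch below. Note that the rewrite of c_v64_pack_s16_s8 also corrects the old branches, which mapped values below -128 to 128 instead of -128.

/* Assumed shape of SIMD_CLAMP (not shown in this hunk): clamp value into
 * [min, max] so the saturating add/sub/pack helpers read as one expression. */
#define SIMD_CLAMP(value, min, max) \
  ((value) > (max) ? (max) : ((value) < (min) ? (min) : (value)))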
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 1f273fe96..ec27a6bf4 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -43,14 +43,14 @@ SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
}
SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
- return _mm_set_epi32(0, 0, x, y);
+ return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y);
}
SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
- return _mm_cvtsi64_si128(x);
+ return _mm_cvtsi64_si128((int64_t)x);
#else
- return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
+ return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x);
#endif
}
@@ -101,11 +101,11 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
-SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
-SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
-SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
@@ -178,14 +178,11 @@ SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
__m128i t = _mm_unpacklo_epi64(b, a);
return _mm_packus_epi32(t, t);
#else
- int32_t ah = v64_high_u32(a);
- int32_t al = v64_low_u32(a);
- int32_t bh = v64_high_u32(b);
- int32_t bl = v64_low_u32(b);
- return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah,
- al > 65535 ? 65535 : al < 0 ? 0 : al,
- bh > 65535 ? 65535 : bh < 0 ? 0 : bh,
- bl > 65535 ? 65535 : bl < 0 ? 0 : bl);
+ const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535);
+ const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535);
+ const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535);
+ const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535);
+ return v64_from_16(ah, al, bh, bl);
#endif
}
@@ -279,7 +276,7 @@ SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
v64 output;
unsigned char *input = (unsigned char *)&x;
unsigned char *index = (unsigned char *)&pattern;
- char *selected = (char *)&output;
+ unsigned char *selected = (unsigned char *)&output;
int counter;
for (counter = 0; counter < 8; counter++) {
@@ -433,42 +430,43 @@ SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
- return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
- _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+ return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
- _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
return _mm_packs_epi16(
- _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
+ _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))),
+ a);
}
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
- return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
- return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
- return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
- return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
- return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
- return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}
/* These intrinsics require immediate values, so we must use #defines
@@ -476,9 +474,9 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
#define v64_shl_n_8(a, c) \
- _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+ _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
- _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+ _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
_mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index d7641607f..f72feea77 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -209,14 +209,41 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}
-void aom_get_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride, uint32_t *sse,
- int *sum) {
+void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
// Loop over 4 8x8 blocks. Process one 8x32 block.
for (int k = 0; k < 4; k++) {
- variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse[k],
- &sum[k]);
+ variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k],
+ &sum8x8[k]);
}
+
+ // Calculate variance at 8x8 level and total sse, sum of 8x32 block.
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++)
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+}
+
+void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse16x16, unsigned int *tot_sse,
+ int *tot_sum, uint32_t *var16x16) {
+ int sum16x16[64] = { 0 };
+ // Loop over two consecutive 16x16 blocks and process as one 16x32 block.
+ for (int k = 0; k < 2; k++) {
+ variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
+ 16, 16, &sse16x16[k], &sum16x16[k]);
+ }
+
+ // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++)
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
}
/* Identical to the variance call except it does not calculate the
@@ -1240,6 +1267,20 @@ uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
return sum;
}
+uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
+ int h) {
+ uint16_t *src_temp = src;
+ uint8_t *dst_temp = dst;
+ const int num_blks = 16 / w;
+ int64_t sum = 0;
+ for (int i = 0; i < num_blks; i++) {
+ sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
+ dst_temp += w;
+ src_temp += (w * h);
+ }
+ return sum;
+}
+
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
int sstride, int w, int h) {
uint64_t sum = 0;
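In the new helpers above, the >> 6 and >> 8 shifts are log2 of the block's pixel count (64 for 8x8, 256 for 16x16), so each entry is the usual var = sse - sum^2 / N. Worked example with assumed inputs: for one 8x8 sub-block with sum8x8[i] = 64 and sse8x8[i] = 1000, var8x8[i] = 1000 - ((64 * 64) >> 6) = 1000 - 64 = 936.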
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index dae419763..6603d312b 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -90,6 +90,7 @@ typedef struct aom_variance_vtable {
aom_subpixvariance_fn_t svf;
aom_subp_avg_variance_fn_t svaf;
aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
// Same as sadx4, but downsample the rows by a factor of 2.
aom_sad_multi_d_fn_t sdsx4df;
aom_masked_sad_fn_t msdf;
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6c8db3a02..c85d8c5ed 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -505,50 +505,356 @@ int aom_satd_lp_avx2(const int16_t *coeff, int length) {
}
}
-static INLINE __m256i calc_avg_8x8_dual_avx2(const uint8_t *s, int p) {
- const __m256i s0 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s)));
- const __m256i s1 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + p)));
- const __m256i s2 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 2 * p)));
- const __m256i s3 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 3 * p)));
- const __m256i sum0 =
- _mm256_add_epi16(_mm256_add_epi16(s0, s1), _mm256_add_epi16(s2, s3));
- const __m256i s4 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 4 * p)));
- const __m256i s5 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 5 * p)));
- const __m256i s6 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 6 * p)));
- const __m256i s7 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 7 * p)));
- const __m256i sum1 =
- _mm256_add_epi16(_mm256_add_epi16(s4, s5), _mm256_add_epi16(s6, s7));
-
- // The result of two 8x8 sub-blocks in 16x16 block.
- return _mm256_add_epi16(sum0, sum1);
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
}
void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
int *avg) {
- // Process 1st and 2nd 8x8 sub-blocks in a 16x16 block.
- const uint8_t *s_tmp = s + y16_idx * p + x16_idx;
- __m256i result_0 = calc_avg_8x8_dual_avx2(s_tmp, p);
-
- // Process 3rd and 4th 8x8 sub-blocks in a 16x16 block.
- s_tmp = s + ((y16_idx + 8) * p) + x16_idx;
- __m256i result_1 = calc_avg_8x8_dual_avx2(s_tmp, p);
-
- const __m256i constant_32 = _mm256_set1_epi16(32);
- result_0 = _mm256_hadd_epi16(result_0, result_1);
- result_1 = _mm256_adds_epu16(result_0, _mm256_srli_si256(result_0, 4));
- result_0 = _mm256_adds_epu16(result_1, _mm256_srli_si256(result_1, 2));
- result_0 = _mm256_adds_epu16(result_0, constant_32);
- result_0 = _mm256_srli_epi16(result_0, 6);
- avg[0] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 0);
- avg[1] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 0);
- avg[2] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 4);
- avg[3] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 4);
+ const uint8_t *s_y0 = s + y16_idx * p + x16_idx;
+ const uint8_t *s_y1 = s_y0 + 8 * p;
+ __m256i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm256_setzero_si256();
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, s_y0 + 2 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0);
+ sum0 = _mm256_add_epi16(s0, s1);
+ sum1 = _mm256_add_epi16(s2, s3);
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0);
+ sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1));
+ sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3));
+ sum0 = _mm256_add_epi16(sum0, sum1);
+
+ // (avg + 32) >> 6
+ __m256i rounding = _mm256_set1_epi32(32);
+ sum0 = _mm256_add_epi32(sum0, rounding);
+ sum0 = _mm256_srli_epi32(sum0, 6);
+ __m128i lo = _mm256_castsi256_si128(sum0);
+ __m128i hi = _mm256_extracti128_si256(sum0, 1);
+ avg[0] = _mm_cvtsi128_si32(lo);
+ avg[1] = _mm_extract_epi32(lo, 2);
+ avg[2] = _mm_cvtsi128_si32(hi);
+ avg[3] = _mm_extract_epi32(hi, 2);
+}
+
+void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // SIMD implementation assumes width and height to be multiple of 16 and 2
+ // respectively. For any odd width or height, SIMD support needs to be added.
+ assert(width % 16 == 0 && height % 2 == 0);
+
+ if (width % 32 == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int wd = 0; wd < width; wd += 32) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m256i s0 = zero;
+ __m256i s1 = zero;
+ int idx = 0;
+ do {
+ __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ __m256i t0 = _mm256_unpacklo_epi8(src_line, zero);
+ __m256i t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_add_epi16(s0, t0);
+ s1 = _mm256_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+
+ src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ t0 = _mm256_unpacklo_epi8(src_line, zero);
+ t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_add_epi16(s0, t0);
+ s1 = _mm256_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+ s0 = _mm256_srai_epi16(s0, norm_factor);
+ s1 = _mm256_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), _mm256_castsi256_si128(s0));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), _mm256_castsi256_si128(s1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 16),
+ _mm256_extractf128_si256(s0, 1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 24),
+ _mm256_extractf128_si256(s1, 1));
+ }
+ } else if (width % 16 == 0) {
+ aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor);
+ }
+}
+
+static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src,
+ const int stride) {
+ src[0] = _mm256_loadu_si256((const __m256i *)ref1);
+ src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(ref1 + (2 * stride)));
+ src[3] = _mm256_loadu_si256((const __m256i *)(ref1 + (3 * stride)));
+}
+
+#define CALC_TOT_SAD_AND_STORE \
+ /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */ \
+ const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0); \
+ /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */ \
+ const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01); \
+ /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */ \
+ const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012); \
+ \
+ const __m128i results0 = _mm_add_epi16( \
+ _mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \
+ const __m128i results1 = \
+ _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \
+ _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor));
+
+static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride,
+ const int height,
+ int norm_factor) {
+ const __m256i zero = _mm256_setzero_si256();
+ int ht = 0;
+ // Post sad operation, the data is present in lower 16-bit of each 64-bit lane
+ // and higher 16-bits are Zero. Here, we are processing 8 rows at a time to
+ // utilize the higher 16-bits efficiently.
+ do {
+ __m256i src_00 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+ src_00 = _mm256_inserti128_si256(
+ src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1);
+ __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1)));
+ src_01 = _mm256_inserti128_si256(
+ src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1);
+ __m256i src_10 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2)));
+ src_10 = _mm256_inserti128_si256(
+ src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1);
+ __m256i src_11 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3)));
+ src_11 = _mm256_inserti128_si256(
+ src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1);
+
+ // s00 x x x s01 x x x | s40 x x x s41 x x x
+ const __m256i s0 = _mm256_sad_epu8(src_00, zero);
+ // s10 x x x s11 x x x | s50 x x x s51 x x x
+ const __m256i s1 = _mm256_sad_epu8(src_01, zero);
+ // s20 x x x s21 x x x | s60 x x x s61 x x x
+ const __m256i s2 = _mm256_sad_epu8(src_10, zero);
+ // s30 x x x s31 x x x | s70 x x x s71 x x x
+ const __m256i s3 = _mm256_sad_epu8(src_11, zero);
+
+ // s00 s10 x x x x x x | s40 s50 x x x x x x
+ const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1);
+ // s01 s11 x x x x x x | s41 s51 x x x x x x
+ const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1);
+ // s20 s30 x x x x x x | s60 s70 x x x x x x
+ const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3);
+ // s21 s31 x x x x x x | s61 s71 x x x x x x
+ const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3);
+
+ // s0 s1 x x x x x x | s4 s5 x x x x x x
+ const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi);
+ // s2 s3 x x x x x x | s6 s7 x x x x x x
+ const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi);
+
+ // s0 s1 s2 s3 s4 s5 s6 s7
+ const __m128i results = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08));
+ _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor));
+ vbuf += 8;
+ ref += (ref_stride << 3);
+ ht += 8;
+ } while (ht < height);
+}
+
+void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ assert(width % 16 == 0);
+ if (width == 128) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[16];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+ load_from_src_buf(ref + 64, &src[8], ref_stride);
+ load_from_src_buf(ref + 96, &src[12], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero),
+ _mm256_sad_epu8(src[4], zero));
+ const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero),
+ _mm256_sad_epu8(src[12], zero));
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero),
+ _mm256_sad_epu8(src[5], zero));
+ const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero),
+ _mm256_sad_epu8(src[13], zero));
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero),
+ _mm256_sad_epu8(src[6], zero));
+ const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero),
+ _mm256_sad_epu8(src[14], zero));
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero),
+ _mm256_sad_epu8(src[7], zero));
+ const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero),
+ _mm256_sad_epu8(src[15], zero));
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 64) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[8];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_sad_epu8(src[0], zero);
+ const __m256i s1 = _mm256_sad_epu8(src[4], zero);
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_sad_epu8(src[1], zero);
+ const __m256i s3 = _mm256_sad_epu8(src[5], zero);
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_sad_epu8(src[2], zero);
+ const __m256i s5 = _mm256_sad_epu8(src[6], zero);
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_sad_epu8(src[3], zero);
+ const __m256i s7 = _mm256_sad_epu8(src[7], zero);
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 32) {
+ assert(height % 2 == 0);
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[4];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+
+ // s00 x x x s01 x x x s02 x x x s03 x x x
+ const __m256i r0 = _mm256_sad_epu8(src[0], zero);
+ // s10 x x x s11 x x x s12 x x x s13 x x x
+ const __m256i r1 = _mm256_sad_epu8(src[1], zero);
+ // s20 x x x s21 x x x s22 x x x s23 x x x
+ const __m256i r2 = _mm256_sad_epu8(src[2], zero);
+ // s30 x x x s31 x x x s32 x x x s33 x x x
+ const __m256i r3 = _mm256_sad_epu8(src[3], zero);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 16) {
+ aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor);
+ }
+}
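+
+// Illustrative scalar equivalent of aom_int_pro_col_avx2() above (a reference
+// sketch only; a 16-bit sum is sufficient for widths up to 128):
+//   for (int h = 0; h < height; ++h) {
+//     int16_t sum = 0;
+//     for (int w = 0; w < width; ++w) sum += ref[h * ref_stride + w];
+//     vbuf[h] = sum >> norm_factor;
+//   }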
+
+static INLINE void calc_vector_mean_sse_64wd(const int16_t *ref,
+ const int16_t *src, __m256i *mean,
+ __m256i *sse) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i ref_line2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i ref_line3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+
+ const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0);
+ const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1);
+ const __m256i diff2 = _mm256_sub_epi16(ref_line2, src_line2);
+ const __m256i diff3 = _mm256_sub_epi16(ref_line3, src_line3);
+ const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i diff_sqr2 = _mm256_madd_epi16(diff2, diff2);
+ const __m256i diff_sqr3 = _mm256_madd_epi16(diff3, diff3);
+
+ *mean = _mm256_add_epi16(*mean, _mm256_add_epi16(diff0, diff1));
+ *mean = _mm256_add_epi16(*mean, diff2);
+ *mean = _mm256_add_epi16(*mean, diff3);
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(diff_sqr0, diff_sqr1));
+ *sse = _mm256_add_epi32(*sse, diff_sqr2);
+ *sse = _mm256_add_epi32(*sse, diff_sqr3);
+}
+
+#define CALC_VAR_FROM_MEAN_SSE(mean, sse) \
+ { \
+ mean = _mm256_madd_epi16(mean, _mm256_set1_epi16(1)); \
+ mean = _mm256_hadd_epi32(mean, sse); \
+ mean = _mm256_add_epi32(mean, _mm256_bsrli_epi128(mean, 4)); \
+ const __m128i result = _mm_add_epi32(_mm256_castsi256_si128(mean), \
+ _mm256_extractf128_si256(mean, 1)); \
+ /*(mean * mean): dynamic range 31 bits.*/ \
+ const int mean_int = _mm_extract_epi32(result, 0); \
+ const int sse_int = _mm_extract_epi32(result, 2); \
+ const unsigned int mean_abs = abs(mean_int); \
+ var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2)); \
+ }
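+
+// The macro above applies the usual identity for the sum of squared
+// deviations: with n = 4 << bwl elements, mean_int is the sum of the
+// differences and sse_int the sum of their squares, so
+//   var = sse_int - (mean_int * mean_int) / n,
+// where the division by n is carried out as the right shift by (bwl + 2).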
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl) {
+ const int width = 4 << bwl;
+ assert(width % 16 == 0 && width <= 128);
+ int var = 0;
+
+ // Instead of looping over the width in steps of 16, the cases below are
+ // unrolled to avoid some addition operations.
+ if (width == 128) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse);
+ calc_vector_mean_sse_64wd(src + 64, ref + 64, &mean, &sse);
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 64) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse);
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 32) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+
+ const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0);
+ const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1);
+ const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i sse = _mm256_add_epi32(diff_sqr0, diff_sqr1);
+ __m256i mean = _mm256_add_epi16(diff0, diff1);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 16) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line = _mm256_loadu_si256((const __m256i *)ref);
+ __m256i mean = _mm256_sub_epi16(ref_line, src_line);
+ const __m256i sse = _mm256_madd_epi16(mean, mean);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ }
+ return var;
}
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index bdbd1f6a2..71e702888 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -14,6 +14,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
#include "aom_ports/mem.h"
void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
@@ -95,39 +96,61 @@ void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
}
unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
- __m128i s0, s1, u0;
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
unsigned int avg = 0;
u0 = _mm_setzero_si128();
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
-
- s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
- avg = _mm_extract_epi16(s0, 0);
+ s0 = loadh_epi64((const __m128i *)(s + p),
+ _mm_loadl_epi64((const __m128i *)(s)));
+ s1 = loadh_epi64((const __m128i *)(s + 3 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 2 * p)));
+ s2 = loadh_epi64((const __m128i *)(s + 5 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 4 * p)));
+ s3 = loadh_epi64((const __m128i *)(s + 7 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 6 * p)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s2 = _mm_sad_epu8(s2, u0);
+ s3 = _mm_sad_epu8(s3, u0);
+
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ sum0 = _mm_add_epi16(sum0, sum1);
+ sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8));
+ avg = _mm_cvtsi128_si32(sum0);
return (avg + 32) >> 6;
}
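+
+// calc_avg_8x8_dual_sse2() below computes the rounded averages of two
+// horizontally adjacent 8x8 blocks (starting at s and s + 8) in one pass:
+// each 16-byte row load is reduced with _mm_sad_epu8, which keeps the two
+// 8-pixel row sums in separate 64-bit halves; each result is then rounded
+// as (block sum + 32) >> 6.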
+void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0);
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0);
+ sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1));
+ sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3));
+ sum0 = _mm_add_epi16(sum0, sum1);
+
+ // (avg + 32) >> 6
+ __m128i rounding = _mm_set1_epi32(32);
+ sum0 = _mm_add_epi32(sum0, rounding);
+ sum0 = _mm_srli_epi32(sum0, 6);
+ avg[0] = _mm_cvtsi128_si32(sum0);
+ avg[1] = _mm_extract_epi16(sum0, 4);
+}
+
void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
int *avg) {
- for (int k = 0; k < 4; k++) {
- const int x8_idx = x16_idx + ((k & 1) << 3);
- const int y8_idx = y16_idx + ((k >> 1) << 3);
- const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
- avg[k] = aom_avg_8x8_sse2(s_tmp, p);
+ const uint8_t *s_ptr = s + y16_idx * p + x16_idx;
+ for (int k = 0; k < 2; k++) {
+ calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2);
+ s_ptr += 8 * p;
}
}
@@ -135,17 +158,14 @@ unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
unsigned int avg = 0;
u0 = _mm_setzero_si128();
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
-
- s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
- avg = _mm_extract_epi16(s0, 0);
+ s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
+ _mm_cvtsi32_si128(*(const int *)(s + p)));
+ s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
+ _mm_cvtsi32_si128(*(const int *)(s + p * 3)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s0 = _mm_add_epi16(s0, s1);
+ avg = _mm_cvtsi128_si32(s0);
return (avg + 8) >> 4;
}
@@ -497,11 +517,11 @@ static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
coeff3 = _mm_sub_epi16(b1, b3);
if (is_final) {
- store_tran_low(coeff0, coeff);
- store_tran_low(coeff1, coeff + 64);
- store_tran_low(coeff2, coeff + 128);
- store_tran_low(coeff3, coeff + 192);
- coeff += 8;
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 64);
+ store_tran_low_offset_4(coeff2, coeff + 128);
+ store_tran_low_offset_4(coeff3, coeff + 192);
+ coeff += 4;
} else {
_mm_store_si128((__m128i *)coeff16, coeff0);
_mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
@@ -511,6 +531,10 @@ static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
}
t_coeff += 8;
+ // Increment the pointer by an extra 0 or 8 on alternate iterations (instead
+ // of a fixed 8) to stay coherent with the output layout produced by
+ // store_tran_low_offset_4().
+ coeff += (((idx >> 3) & 1) << 3);
}
}
@@ -553,15 +577,18 @@ void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
coeff0 = _mm_add_epi16(b0, b2);
coeff1 = _mm_add_epi16(b1, b3);
- store_tran_low(coeff0, coeff);
- store_tran_low(coeff1, coeff + 256);
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 256);
coeff2 = _mm_sub_epi16(b0, b2);
coeff3 = _mm_sub_epi16(b1, b3);
- store_tran_low(coeff2, coeff + 512);
- store_tran_low(coeff3, coeff + 768);
+ store_tran_low_offset_4(coeff2, coeff + 512);
+ store_tran_low_offset_4(coeff3, coeff + 768);
- coeff += 8;
+ // Increment the pointer by 4 and 12 on alternate iterations (instead of a
+ // fixed 8) to stay coherent with the output layout produced by
+ // store_tran_low_offset_4().
+ coeff += (4 + (((idx >> 3) & 1) << 3));
t_coeff += 8;
}
}
@@ -625,74 +652,64 @@ int aom_satd_lp_sse2(const int16_t *coeff, int length) {
return _mm_cvtsi128_si32(accum);
}
-void aom_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
- const int ref_stride, const int height) {
- int idx = 1;
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // The SIMD implementation assumes the width and height to be multiples of 16
+ // and 2 respectively. Other widths and heights would need dedicated SIMD
+ // support to be added.
+ assert(width % 16 == 0 && height % 2 == 0);
__m128i zero = _mm_setzero_si128();
- __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
- __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
- __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
- __m128i t0, t1;
- int height_1 = height - 1;
- ref += ref_stride;
- do {
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- ref += ref_stride;
-
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- ref += ref_stride;
- idx += 2;
- } while (idx < height_1);
-
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- if (height == 128) {
- s0 = _mm_srai_epi16(s0, 6);
- s1 = _mm_srai_epi16(s1, 6);
- } else if (height == 64) {
- s0 = _mm_srai_epi16(s0, 5);
- s1 = _mm_srai_epi16(s1, 5);
- } else if (height == 32) {
- s0 = _mm_srai_epi16(s0, 4);
- s1 = _mm_srai_epi16(s1, 4);
- } else {
- assert(height == 16);
- s0 = _mm_srai_epi16(s0, 3);
- s1 = _mm_srai_epi16(s1, 3);
- }
- _mm_storeu_si128((__m128i *)hbuf, s0);
- hbuf += 8;
- _mm_storeu_si128((__m128i *)hbuf, s1);
+ for (int wd = 0; wd < width; wd += 16) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m128i s0 = zero;
+ __m128i s1 = zero;
+ int idx = 0;
+ do {
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_add_epi16(s0, t0);
+ s1 = _mm_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_add_epi16(s0, t0);
+ s1 = _mm_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+
+ s0 = _mm_srai_epi16(s0, norm_factor);
+ s1 = _mm_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), s0);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1);
+ }
}
-int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
- __m128i zero = _mm_setzero_si128();
- __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
- __m128i s0 = _mm_sad_epu8(src_line, zero);
- __m128i s1;
- int i;
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // The SIMD implementation assumes the width to be a multiple of 16.
+ assert(width % 16 == 0);
+
+ for (int ht = 0; ht < height; ht++) {
+ const uint8_t *ref_tmp = ref + (ht * ref_stride);
+ __m128i zero = _mm_setzero_si128();
+ __m128i s0 = zero;
+ __m128i s1, src_line;
+ for (int i = 0; i < width; i += 16) {
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_add_epi16(s0, s1);
+ ref_tmp += 16;
+ }
- for (i = 16; i < width; i += 16) {
- ref += 16;
- src_line = _mm_loadu_si128((const __m128i *)ref);
- s1 = _mm_sad_epu8(src_line, zero);
- s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_add_epi16(s0, s1);
+ vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor;
}
-
- s1 = _mm_srli_si128(s0, 8);
- s0 = _mm_adds_epu16(s0, s1);
-
- return _mm_extract_epi16(s0, 0);
}
diff --git a/aom_dsp/x86/avg_intrin_sse4.c b/aom_dsp/x86/avg_intrin_sse4.c
new file mode 100644
index 000000000..b83b43122
--- /dev/null
+++ b/aom_dsp/x86/avg_intrin_sse4.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl) {
+ const int width = 4 << bwl;
+ assert(width % 16 == 0);
+
+ const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+ __m128i mean = _mm_setzero_si128();
+ __m128i sse = _mm_setzero_si128();
+
+ for (int i = 0; i < width; i += 16) {
+ const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
+ const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
+ const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
+ __m128i diff = _mm_sub_epi16(ref_line, src_line);
+ const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
+ __m128i diff_sqr = _mm_madd_epi16(diff, diff);
+ const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
+
+ diff = _mm_add_epi16(diff, diff2);
+ diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
+ sse = _mm_add_epi32(sse, diff_sqr);
+ mean = _mm_add_epi16(mean, diff);
+
+ src += 16;
+ ref += 16;
+ }
+
+ // m0 m1 m2 m3
+ mean = _mm_madd_epi16(mean, k_one_epi16);
+ // m0+m1 m2+m3 s0+s1 s2+s3
+ __m128i result = _mm_hadd_epi32(mean, sse);
+ // m0+m1+m2+m3 s0+s1+s2+s3 x x
+ result = _mm_add_epi32(result, _mm_bsrli_si128(result, 4));
+
+ // (mean * mean): dynamic range 31 bits.
+ const int mean_int = _mm_extract_epi32(result, 0);
+ const int sse_int = _mm_extract_epi32(result, 2);
+ const unsigned int mean_abs = abs(mean_int);
+ const int var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2));
+ return var;
+}
diff --git a/aom_dsp/x86/bitdepth_conversion_sse2.h b/aom_dsp/x86/bitdepth_conversion_sse2.h
index 42bb2d1d3..ff77760b6 100644
--- a/aom_dsp/x86/bitdepth_conversion_sse2.h
+++ b/aom_dsp/x86/bitdepth_conversion_sse2.h
@@ -22,14 +22,28 @@ static INLINE __m128i load_tran_low(const tran_low_t *a) {
return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
}
-// Store 8 16 bit values. If the destination is 32 bits then sign extend the
-// values by multiplying by 1.
-static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+static INLINE void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) {
const __m128i one = _mm_set1_epi16(1);
const __m128i a_hi = _mm_mulhi_epi16(a, one);
const __m128i a_lo = _mm_mullo_epi16(a, one);
- const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
- const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+ *a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+ *a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+ __m128i a_1, a_2;
+ unpack_trans(a, &a_1, &a_2);
_mm_store_si128((__m128i *)(b), a_1);
_mm_store_si128((__m128i *)(b + 4), a_2);
}
+// Similar to store_tran_low(), but stores the second half at an offset of 8
+// (instead of 4) so that the output layout matches that of the AVX2
+// implementation.
+static INLINE void store_tran_low_offset_4(__m128i a, tran_low_t *b) {
+ __m128i a_1, a_2;
+ unpack_trans(a, &a_1, &a_2);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 8), a_2);
+}
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 95383d2fd..dfbab324d 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -910,14 +910,14 @@ static INLINE void highbd_blend_a64_d16_mask_w4_avx2(
const __m256i *round_offset, int shift, const __m256i *clip_low,
const __m256i *clip_high, const __m256i *mask_max) {
// Load 4x u16 pixels from each of 4 rows from each source
- const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride),
- *(uint64_t *)(src0 + 2 * src0_stride),
- *(uint64_t *)(src0 + 1 * src0_stride),
- *(uint64_t *)(src0 + 0 * src0_stride));
- const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride),
- *(uint64_t *)(src1 + 2 * src1_stride),
- *(uint64_t *)(src1 + 1 * src1_stride),
- *(uint64_t *)(src1 + 0 * src1_stride));
+ const __m256i s0 = _mm256_set_epi64x(*(int64_t *)(src0 + 3 * src0_stride),
+ *(int64_t *)(src0 + 2 * src0_stride),
+ *(int64_t *)(src0 + 1 * src0_stride),
+ *(int64_t *)(src0 + 0 * src0_stride));
+ const __m256i s1 = _mm256_set_epi64x(*(int64_t *)(src1 + 3 * src1_stride),
+ *(int64_t *)(src1 + 2 * src1_stride),
+ *(int64_t *)(src1 + 1 * src1_stride),
+ *(int64_t *)(src1 + 0 * src1_stride));
// Generate the inverse mask
const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
@@ -964,10 +964,10 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
const __m256i *clip_high, const __m256i *mask_max) {
do {
// Load 8x u8 pixels from each of 4 rows of the mask, pad each to u16
- const __m128i mask08 = _mm_set_epi32(*(uint32_t *)(mask + 3 * mask_stride),
- *(uint32_t *)(mask + 2 * mask_stride),
- *(uint32_t *)(mask + 1 * mask_stride),
- *(uint32_t *)(mask + 0 * mask_stride));
+ const __m128i mask08 = _mm_set_epi32(*(int32_t *)(mask + 3 * mask_stride),
+ *(int32_t *)(mask + 2 * mask_stride),
+ *(int32_t *)(mask + 1 * mask_stride),
+ *(int32_t *)(mask + 0 * mask_stride));
const __m256i mask0 = _mm256_cvtepu8_epi16(mask08);
highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
@@ -994,15 +994,15 @@ static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
// (saturating) add together rows then use madd to add adjacent pixels
// Finally, divide each value by 4 (with rounding)
const __m256i m0246 =
- _mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride),
- *(uint64_t *)(mask + 4 * mask_stride),
- *(uint64_t *)(mask + 2 * mask_stride),
- *(uint64_t *)(mask + 0 * mask_stride));
+ _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride),
+ *(int64_t *)(mask + 4 * mask_stride),
+ *(int64_t *)(mask + 2 * mask_stride),
+ *(int64_t *)(mask + 0 * mask_stride));
const __m256i m1357 =
- _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride),
- *(uint64_t *)(mask + 5 * mask_stride),
- *(uint64_t *)(mask + 3 * mask_stride),
- *(uint64_t *)(mask + 1 * mask_stride));
+ _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride),
+ *(int64_t *)(mask + 5 * mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride),
+ *(int64_t *)(mask + 1 * mask_stride));
const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
const __m256i mask0 =
@@ -1101,10 +1101,10 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
do {
// Load 8x u8 pixels from each of 4 rows in the mask
const __m128i mask0a8 =
- _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride));
+ _mm_set_epi64x(*(int64_t *)mask, *(int64_t *)(mask + mask_stride));
const __m128i mask0b8 =
- _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride),
- *(uint64_t *)(mask + 3 * mask_stride));
+ _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride));
const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
@@ -1307,7 +1307,7 @@ void aom_highbd_blend_a64_d16_mask_avx2(
const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
- const __m256i clip_low = _mm256_set1_epi16(0);
+ const __m256i clip_low = _mm256_setzero_si256();
const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index 4a368ef94..58a7345ec 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1121,13 +1121,13 @@ static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1(
const __m128i *mask_max) {
// Load 4 pixels from each of 4 rows from each source
const __m128i s0a =
- _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride));
- const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride),
- *(uint64_t *)(src0 + 3 * src0_stride));
+ _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride));
+ const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride),
+ *(int64_t *)(src0 + 3 * src0_stride));
const __m128i s1a =
- _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride));
- const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride),
- *(uint64_t *)(src1 + 3 * src1_stride));
+ _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride));
+ const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride),
+ *(int64_t *)(src1 + 3 * src1_stride));
// Generate the inverse masks
const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
@@ -1187,11 +1187,11 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
const __m128i *round_offset, int shift, const __m128i *clip_low,
const __m128i *clip_high, const __m128i *mask_max) {
do {
- const __m128i mask0a8 = _mm_set_epi32(0, 0, *(uint32_t *)mask,
- *(uint32_t *)(mask + mask_stride));
+ const __m128i mask0a8 =
+ _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride));
const __m128i mask0b8 =
- _mm_set_epi32(0, 0, *(uint32_t *)(mask + 2 * mask_stride),
- *(uint32_t *)(mask + 3 * mask_stride));
+ _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride),
+ *(int32_t *)(mask + 3 * mask_stride));
const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8);
const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8);
@@ -1218,16 +1218,16 @@ static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
// Load 8 pixels from each of 8 rows of mask,
// (saturating) add together rows then use madd to add adjacent pixels
// Finally, divide each value by 4 (with rounding)
- const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask),
- *(uint64_t *)(mask + 2 * mask_stride));
- const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride),
- *(uint64_t *)(mask + 3 * mask_stride));
+ const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),
+ *(int64_t *)(mask + 2 * mask_stride));
+ const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride));
const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
- const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * mask_stride),
- *(uint64_t *)(mask + 6 * mask_stride));
- const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride),
- *(uint64_t *)(mask + 7 * mask_stride));
+ const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
+ *(int64_t *)(mask + 6 * mask_stride));
+ const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
+ *(int64_t *)(mask + 7 * mask_stride));
const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
@@ -1493,7 +1493,7 @@ void aom_highbd_blend_a64_d16_mask_sse4_1(
const __m128i v_round_offset = _mm_set1_epi32(round_offset);
const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
- const __m128i clip_low = _mm_set1_epi16(0);
+ const __m128i clip_low = _mm_setzero_si128();
const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);
const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index a00ede2de..a7090088f 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -12,6 +12,13 @@
#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+#include <immintrin.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
// filters for 16
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h
index ab3cd9155..78ea98522 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -34,7 +34,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
const __m128i *preg1) {
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
- const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
_mm_cmpeq_epi16(*preg0, min_overflow));
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
@@ -48,7 +48,7 @@ static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
const __m128i *preg2,
const __m128i *preg3) {
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
- const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
_mm_cmpeq_epi16(*preg0, min_overflow));
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
diff --git a/aom_dsp/x86/highbd_convolve_avx2.c b/aom_dsp/x86/highbd_convolve_avx2.c
index fdf9524ad..8361e2f9e 100644
--- a/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/aom_dsp/x86/highbd_convolve_avx2.c
@@ -165,9 +165,9 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
res_a_round = _mm256_max_epi16(res_a_round, zero);
- xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+ xx_storel_32(&dst[i * dst_stride + j],
_mm256_castsi256_si128(res_a_round));
- xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
_mm256_extracti128_si256(res_a_round, 1));
}
@@ -275,9 +275,8 @@ void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
_mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
_mm256_extracti128_si256(res, 1));
} else {
- xx_storel_32((__m128i *)&dst[i * dst_stride + j],
- _mm256_castsi256_si128(res));
- xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
_mm256_extracti128_si256(res, 1));
}
}
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 1764a4952..a5c450a1b 100644
--- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -80,7 +80,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
const uint32_t abs_qcoeff =
(uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
@@ -140,7 +141,7 @@ void aom_highbd_quantize_b_32x32_sse2(
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
@@ -198,7 +199,7 @@ void aom_highbd_quantize_b_64x64_sse2(
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c
index ad4db2f8c..e11754e59 100644
--- a/aom_dsp/x86/highbd_sad_avx2.c
+++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -551,7 +551,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
uint32_t *res) {
__m256i u0, u1, u2, u3;
- const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
+ const __m256i mask = yy_set1_64_from_32i(~0);
__m128i sad;
// 8 32-bit summation
@@ -602,71 +602,34 @@ static void init_sad(__m256i *s) {
s[3] = _mm256_setzero_si256();
}
-static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
+static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
+ int M, int N, int D, const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
__m256i sad_vec[4];
const uint16_t *refp[4];
const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
const uint16_t *srcp;
- const int shift_for_4_rows = 2;
- int i, j;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- for (j = 0; j < N; j += 4) {
- sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
- srcp += src_stride << shift_for_4_rows;
- refp[i] += ref_stride << shift_for_4_rows;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- const int shift_for_4_rows = 2;
+ const int shift_for_rows = (M < 128) + (M < 64);
+ const int row_units = 1 << shift_for_rows;
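+ // Rows consumed per inner call: 1 when M == 128 (sad128x1), 2 when M == 64
+ // (sad64x2), and 4 when M == 32 or 16 (sad32x4 / sad16x4); hence
+ // shift_for_rows is 0, 1 or 2 respectively.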
int i, r;
init_sad(sad_vec);
convert_pointers(ref_array, refp);
- for (i = 0; i < 4; ++i) {
+ for (i = 0; i < D; ++i) {
srcp = keep;
- for (r = 0; r < N; r += 4) {
- sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
- srcp += src_stride << shift_for_4_rows;
- refp[i] += ref_stride << shift_for_4_rows;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- const int shift_for_rows = 1;
- int i, r;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- for (r = 0; r < N; r += 2) {
- sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ for (r = 0; r < N; r += row_units) {
+ if (M == 128) {
+ sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
+ } else if (M == 64) {
+ sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ } else if (M == 32) {
+ sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else if (M == 16) {
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else {
+ assert(0);
+ }
srcp += src_stride << shift_for_rows;
refp[i] += ref_stride << shift_for_rows;
}
@@ -674,47 +637,31 @@ static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2(
get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}
-static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- int i, r;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- for (r = 0; r < N; r++) {
- sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
- srcp += src_stride;
- refp[i] += ref_stride;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \
- void aom_highbd_sad##m##x##n##x4d_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
- sad_array); \
+#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
}
#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \
void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \
const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
int ref_stride, uint32_t *sad_array) { \
- aom_highbd_sad##m##xNx4d_avx2((n / 2), src, 2 * src_stride, ref_array, \
- 2 * ref_stride, sad_array); \
+ aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array); \
sad_array[0] <<= 1; \
sad_array[1] <<= 1; \
sad_array[2] <<= 1; \
sad_array[3] <<= 1; \
}
+#define HIGHBD_SAD_MXNX3D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
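+
+// The x3d variants reuse the same kernel with D = 3, so SADs are computed for
+// only the first three candidates in ref_array.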
HIGHBD_SAD_MXNX4D_AVX2(16, 4)
HIGHBD_SAD_MXNX4D_AVX2(16, 8)
@@ -752,3 +699,22 @@ HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(16, 4)
+HIGHBD_SAD_MXNX3D_AVX2(16, 8)
+HIGHBD_SAD_MXNX3D_AVX2(16, 16)
+HIGHBD_SAD_MXNX3D_AVX2(16, 32)
+HIGHBD_SAD_MXNX3D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(32, 8)
+HIGHBD_SAD_MXNX3D_AVX2(32, 16)
+HIGHBD_SAD_MXNX3D_AVX2(32, 32)
+HIGHBD_SAD_MXNX3D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(64, 16)
+HIGHBD_SAD_MXNX3D_AVX2(64, 32)
+HIGHBD_SAD_MXNX3D_AVX2(64, 64)
+HIGHBD_SAD_MXNX3D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(128, 64)
+HIGHBD_SAD_MXNX3D_AVX2(128, 128)
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 49912ac19..36e647383 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -26,13 +26,13 @@ static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
int dst_stride, uint32_t *sse) {
const __m256i filter1 =
- _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[xoffset][1] << 16) |
+ _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) |
bilinear_filters_2t[xoffset][0]);
const __m256i filter2 =
- _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[yoffset][1] << 16) |
+ _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) |
bilinear_filters_2t[yoffset][0]);
const __m256i one = _mm256_set1_epi16(1);
- const uint32_t bitshift = (uint32_t)0x40;
+ const int bitshift = 0x40;
(void)pixel_step;
unsigned int i, j, prev = 0, curr = 2;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index 6bd6a5a3f..d45885caf 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -629,13 +629,12 @@ void aom_highbd_dist_wtd_comp_avg_pred_sse2(
const uint8_t *ref8, int ref_stride,
const DIST_WTD_COMP_PARAMS *jcp_param) {
int i;
- const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
- const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
- const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
- const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
+ const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+ const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set1_epi16(wt0);
+ const __m128i w1 = _mm_set1_epi16(wt1);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index b5f71441f..621ef7af7 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -361,7 +361,7 @@ void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
row_store_32xh(&row, 32, dst, stride);
}
@@ -628,7 +628,7 @@ void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
row_store_32xh(&row, 16, dst, stride);
}
@@ -637,7 +637,7 @@ void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
row_store_32xh(&row, 64, dst, stride);
}
@@ -646,7 +646,7 @@ void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
row_store_64xh(&row, 64, dst, stride);
}
@@ -655,7 +655,7 @@ void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
row_store_64xh(&row, 32, dst, stride);
}
@@ -664,7 +664,7 @@ void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
row_store_64xh(&row, 16, dst, stride);
}
@@ -754,7 +754,7 @@ void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
__m128i x = _mm_loadl_epi64((const __m128i *)left);
const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
__m256i rep = _mm256_set1_epi16((short)0x8000);
const __m256i one = _mm256_set1_epi16(1);
const __m256i top = get_top_vector(above);
@@ -778,7 +778,7 @@ static INLINE __m256i get_left_vector(const uint8_t *left) {
void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const __m256i l = get_left_vector(left);
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
__m256i rep = _mm256_set1_epi16((short)0x8000);
const __m256i one = _mm256_set1_epi16(1);
const __m256i top = get_top_vector(above);
@@ -797,7 +797,7 @@ void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
__m256i l = get_left_vector(left);
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
__m256i rep = _mm256_set1_epi16((short)0x8000);
const __m256i one = _mm256_set1_epi16(1);
const __m256i top = get_top_vector(above);
@@ -826,7 +826,7 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
const __m256i one = _mm256_set1_epi16(1);
const __m256i top = get_top_vector(above);
@@ -864,7 +864,7 @@ void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
const __m256i l = get_left_vector(left);
const __m256i t0 = get_top_vector(above);
const __m256i t1 = get_top_vector(above + 16);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
__m256i rep = _mm256_set1_epi16((short)0x8000);
const __m256i one = _mm256_set1_epi16(1);
@@ -886,7 +886,7 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
__m256i l = get_left_vector(left);
const __m256i t0 = get_top_vector(above);
const __m256i t1 = get_top_vector(above + 16);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
__m256i rep = _mm256_set1_epi16((short)0x8000);
const __m256i one = _mm256_set1_epi16(1);
@@ -924,7 +924,7 @@ void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const __m256i t0 = get_top_vector(above);
const __m256i t1 = get_top_vector(above + 16);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
const __m256i one = _mm256_set1_epi16(1);
int i, j;
@@ -952,7 +952,7 @@ void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
const __m256i t1 = get_top_vector(above + 16);
const __m256i t2 = get_top_vector(above + 32);
const __m256i t3 = get_top_vector(above + 48);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
const __m256i one = _mm256_set1_epi16(1);
int i, j;
@@ -984,7 +984,7 @@ void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
const __m256i t1 = get_top_vector(above + 16);
const __m256i t2 = get_top_vector(above + 32);
const __m256i t3 = get_top_vector(above + 48);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
const __m256i one = _mm256_set1_epi16(1);
int i, j;
@@ -1016,7 +1016,7 @@ void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
const __m256i t1 = get_top_vector(above + 16);
const __m256i t2 = get_top_vector(above + 32);
const __m256i t3 = get_top_vector(above + 48);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
const __m256i one = _mm256_set1_epi16(1);
int i;
@@ -3537,7 +3537,7 @@ static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
__m128i a_mbase_x;
a16 = _mm256_set1_epi16(16);
- a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+ a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
c3f = _mm256_set1_epi16(0x3f);
int x = dx;
@@ -3640,7 +3640,7 @@ static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
__m256i a_mbase_x, diff, c3f;
a16 = _mm256_set1_epi16(16);
- a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
c3f = _mm256_set1_epi16(0x3f);
int x = dx;
@@ -3722,7 +3722,7 @@ static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
__m128i max_base_x128, base_inc128, mask128;
a16 = _mm256_set1_epi16(16);
- a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
max_base_x128 = _mm_set1_epi8(max_base_x);
c3f = _mm256_set1_epi16(0x3f);
@@ -3766,14 +3766,14 @@ static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
_mm256_extracti128_si256(res, 1))); // 16 8bit values
base_inc128 =
- _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1),
- (uint8_t)(base + j + 2), (uint8_t)(base + j + 3),
- (uint8_t)(base + j + 4), (uint8_t)(base + j + 5),
- (uint8_t)(base + j + 6), (uint8_t)(base + j + 7),
- (uint8_t)(base + j + 8), (uint8_t)(base + j + 9),
- (uint8_t)(base + j + 10), (uint8_t)(base + j + 11),
- (uint8_t)(base + j + 12), (uint8_t)(base + j + 13),
- (uint8_t)(base + j + 14), (uint8_t)(base + j + 15));
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
_mm_setzero_si128());
@@ -4092,7 +4092,7 @@ static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
__m128i resx, resy;
__m128i resxy;
int y = r + 1;
- ydx = _mm256_set1_epi16((uint16_t)(y * dx));
+ ydx = _mm256_set1_epi16((int16_t)(y * dx));
int base_x = (-y * dx) >> frac_bits_x;
for (int j = 0; j < W; j += 16) {
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 4786696ee..61e29731c 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -146,7 +146,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
sum += 6;
sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
dc_store_8xh(&row, 4, dst, stride);
}
@@ -313,7 +313,7 @@ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_4(above);
- const __m128i two = _mm_set1_epi16((int16_t)2);
+ const __m128i two = _mm_set1_epi16(2);
sum_above = _mm_add_epi16(sum_above, two);
sum_above = _mm_srai_epi16(sum_above, 2);
sum_above = _mm_shufflelo_epi16(sum_above, 0);
@@ -327,7 +327,7 @@ void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_4(above);
- const __m128i two = _mm_set1_epi16((int16_t)2);
+ const __m128i two = _mm_set1_epi16(2);
sum_above = _mm_add_epi16(sum_above, two);
sum_above = _mm_srai_epi16(sum_above, 2);
sum_above = _mm_shufflelo_epi16(sum_above, 0);
@@ -341,7 +341,7 @@ void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_8(above);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
+ const __m128i four = _mm_set1_epi16(4);
sum_above = _mm_add_epi16(sum_above, four);
sum_above = _mm_srai_epi16(sum_above, 3);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -353,7 +353,7 @@ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_8(above);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
+ const __m128i four = _mm_set1_epi16(4);
sum_above = _mm_add_epi16(sum_above, four);
sum_above = _mm_srai_epi16(sum_above, 3);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -365,7 +365,7 @@ void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_8(above);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
+ const __m128i four = _mm_set1_epi16(4);
sum_above = _mm_add_epi16(sum_above, four);
sum_above = _mm_srai_epi16(sum_above, 3);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -377,7 +377,7 @@ void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_16_sse2(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_above = _mm_add_epi16(sum_above, eight);
sum_above = _mm_srai_epi16(sum_above, 4);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -390,7 +390,7 @@ void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_16_sse2(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_above = _mm_add_epi16(sum_above, eight);
sum_above = _mm_srai_epi16(sum_above, 4);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -404,7 +404,7 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_16_sse2(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_above = _mm_add_epi16(sum_above, eight);
sum_above = _mm_srai_epi16(sum_above, 4);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -418,7 +418,7 @@ void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_16_sse2(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_above = _mm_add_epi16(sum_above, eight);
sum_above = _mm_srai_epi16(sum_above, 4);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -431,7 +431,7 @@ void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_32_sse2(above);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ const __m128i sixteen = _mm_set1_epi16(16);
sum_above = _mm_add_epi16(sum_above, sixteen);
sum_above = _mm_srai_epi16(sum_above, 5);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -445,7 +445,7 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_32_sse2(above);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ const __m128i sixteen = _mm_set1_epi16(16);
sum_above = _mm_add_epi16(sum_above, sixteen);
sum_above = _mm_srai_epi16(sum_above, 5);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -459,7 +459,7 @@ void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_32_sse2(above);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ const __m128i sixteen = _mm_set1_epi16(16);
sum_above = _mm_add_epi16(sum_above, sixteen);
sum_above = _mm_srai_epi16(sum_above, 5);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -473,7 +473,7 @@ void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_64(above);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
sum_above = _mm_add_epi16(sum_above, thirtytwo);
sum_above = _mm_srai_epi16(sum_above, 6);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -487,7 +487,7 @@ void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_64(above);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
sum_above = _mm_add_epi16(sum_above, thirtytwo);
sum_above = _mm_srai_epi16(sum_above, 6);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -501,7 +501,7 @@ void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)left;
__m128i sum_above = dc_sum_64(above);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
sum_above = _mm_add_epi16(sum_above, thirtytwo);
sum_above = _mm_srai_epi16(sum_above, 6);
sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
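For reference: aside from dropping the redundant (uint16_t)/(int16_t) casts (the _mm_set1_epi16 argument is already a plain short), these DC_TOP/DC_LEFT hunks leave the arithmetic untouched. Each predictor is the rounded average of n edge samples, (sum + n/2) >> log2(n), which the add/arithmetic-shift pairs compute in 16-bit lanes. A standalone scalar sketch of that rounding, not part of the patch (dc_edge_avg is a made-up name):

  #include <assert.h>
  #include <stdint.h>

  uint8_t dc_edge_avg(const uint8_t *edge, int n) {  /* n is a power of two */
    int sum = 0, shift = 0;
    for (int i = 0; i < n; ++i) sum += edge[i];
    while ((1 << shift) < n) ++shift;               /* log2(n) */
    return (uint8_t)((sum + (n >> 1)) >> shift);    /* rounded average */
  }

  int main(void) {
    const uint8_t above[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    assert(dc_edge_avg(above, 8) == 14);            /* (108 + 4) >> 3 */
    return 0;
  }
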
@@ -517,7 +517,7 @@ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_8(left);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
+ const __m128i four = _mm_set1_epi16(4);
sum_left = _mm_add_epi16(sum_left, four);
sum_left = _mm_srai_epi16(sum_left, 3);
sum_left = _mm_shufflelo_epi16(sum_left, 0);
@@ -532,7 +532,7 @@ void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_16_sse2(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_left = _mm_add_epi16(sum_left, eight);
sum_left = _mm_srai_epi16(sum_left, 4);
sum_left = _mm_shufflelo_epi16(sum_left, 0);
@@ -546,7 +546,7 @@ void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_4(left);
- const __m128i two = _mm_set1_epi16((uint16_t)2);
+ const __m128i two = _mm_set1_epi16(2);
sum_left = _mm_add_epi16(sum_left, two);
sum_left = _mm_srai_epi16(sum_left, 2);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -559,7 +559,7 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_16_sse2(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_left = _mm_add_epi16(sum_left, eight);
sum_left = _mm_srai_epi16(sum_left, 4);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -572,7 +572,7 @@ void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_32_sse2(left);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ const __m128i sixteen = _mm_set1_epi16(16);
sum_left = _mm_add_epi16(sum_left, sixteen);
sum_left = _mm_srai_epi16(sum_left, 5);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -585,7 +585,7 @@ void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_4(left);
- const __m128i two = _mm_set1_epi16((uint16_t)2);
+ const __m128i two = _mm_set1_epi16(2);
sum_left = _mm_add_epi16(sum_left, two);
sum_left = _mm_srai_epi16(sum_left, 2);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -599,7 +599,7 @@ void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_8(left);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
+ const __m128i four = _mm_set1_epi16(4);
sum_left = _mm_add_epi16(sum_left, four);
sum_left = _mm_srai_epi16(sum_left, 3);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -613,7 +613,7 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_32_sse2(left);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ const __m128i sixteen = _mm_set1_epi16(16);
sum_left = _mm_add_epi16(sum_left, sixteen);
sum_left = _mm_srai_epi16(sum_left, 5);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -627,7 +627,7 @@ void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_64(left);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
sum_left = _mm_add_epi16(sum_left, thirtytwo);
sum_left = _mm_srai_epi16(sum_left, 6);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -641,7 +641,7 @@ void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_8(left);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
+ const __m128i four = _mm_set1_epi16(4);
sum_left = _mm_add_epi16(sum_left, four);
sum_left = _mm_srai_epi16(sum_left, 3);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -655,7 +655,7 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_16_sse2(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_left = _mm_add_epi16(sum_left, eight);
sum_left = _mm_srai_epi16(sum_left, 4);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -669,7 +669,7 @@ void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_64(left);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
sum_left = _mm_add_epi16(sum_left, thirtytwo);
sum_left = _mm_srai_epi16(sum_left, 6);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -683,7 +683,7 @@ void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_64(left);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
sum_left = _mm_add_epi16(sum_left, thirtytwo);
sum_left = _mm_srai_epi16(sum_left, 6);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -697,7 +697,7 @@ void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_32_sse2(left);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ const __m128i sixteen = _mm_set1_epi16(16);
sum_left = _mm_add_epi16(sum_left, sixteen);
sum_left = _mm_srai_epi16(sum_left, 5);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -711,7 +711,7 @@ void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
__m128i sum_left = dc_sum_16_sse2(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ const __m128i eight = _mm_set1_epi16(8);
sum_left = _mm_add_epi16(sum_left, eight);
sum_left = _mm_srai_epi16(sum_left, 4);
sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -743,7 +743,7 @@ void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_8xh(&row, 4, dst, stride);
}
@@ -751,7 +751,7 @@ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_8xh(&row, 16, dst, stride);
}
@@ -759,7 +759,7 @@ void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_8xh(&row, 32, dst, stride);
}
@@ -767,7 +767,7 @@ void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_16xh(&row, 4, dst, stride);
}
@@ -775,7 +775,7 @@ void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_16xh(&row, 8, dst, stride);
}
@@ -784,7 +784,7 @@ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_16xh(&row, 32, dst, stride);
}
@@ -793,7 +793,7 @@ void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_16xh(&row, 64, dst, stride);
}
@@ -801,7 +801,7 @@ void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_32xh(&row, 8, dst, stride);
}
@@ -810,7 +810,7 @@ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_32xh(&row, 16, dst, stride);
}
@@ -819,7 +819,7 @@ void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_32xh(&row, 64, dst, stride);
}
@@ -828,7 +828,7 @@ void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_64xh(&row, 64, dst, stride);
}
@@ -837,7 +837,7 @@ void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_64xh(&row, 32, dst, stride);
}
@@ -846,7 +846,7 @@ void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left) {
(void)above;
(void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
+ const __m128i row = _mm_set1_epi8((int8_t)128);
dc_store_64xh(&row, 16, dst, stride);
}
@@ -1334,7 +1334,7 @@ static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, int height) {
int i = height >> 2;
do {
- __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
left4 = _mm_unpacklo_epi8(left4, left4);
left4 = _mm_unpacklo_epi8(left4, left4);
const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
@@ -1364,7 +1364,7 @@ static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, int height) {
int i = height >> 2;
do {
- __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
left4 = _mm_unpacklo_epi8(left4, left4);
left4 = _mm_unpacklo_epi8(left4, left4);
const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
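For reference: the (uint8_t) -> (int8_t) changes throughout this file exist because _mm_set1_epi8() takes a char argument. On toolchains where char is signed, passing (uint8_t)128 is an implicit value-changing conversion (128 becomes -128) that trips conversion warnings; the explicit int8_t cast produces the same 0x80 byte pattern without the warning. A standalone check, not part of the patch:

  #include <assert.h>
  #include <emmintrin.h>
  #include <stdint.h>
  #include <string.h>

  int main(void) {
    const __m128i row = _mm_set1_epi8((int8_t)128);  /* every byte = 0x80 */
    uint8_t out[16], want[16];
    memset(want, 0x80, sizeof(want));
    _mm_storeu_si128((__m128i *)out, row);
    assert(memcmp(out, want, sizeof(out)) == 0);
    return 0;
  }
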
diff --git a/aom_dsp/x86/intrapred_sse4.c b/aom_dsp/x86/intrapred_sse4.c
index 21fb1bb8d..3f72dc485 100644
--- a/aom_dsp/x86/intrapred_sse4.c
+++ b/aom_dsp/x86/intrapred_sse4.c
@@ -141,7 +141,7 @@ static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
__m128i a_mbase_x;
a16 = _mm_set1_epi16(16);
- a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
c3f = _mm_set1_epi16(0x3f);
int x = dx;
@@ -255,7 +255,7 @@ static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
__m128i a_mbase_x, diff, c3f;
a16 = _mm_set1_epi16(16);
- a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
c3f = _mm_set1_epi16(0x3f);
int x = dx;
@@ -353,7 +353,7 @@ static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
__m128i max_base, base_inc, mask;
a16 = _mm_set1_epi16(16);
- a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
max_base = _mm_set1_epi8(max_base_x);
c3f = _mm_set1_epi16(0x3f);
@@ -412,14 +412,14 @@ static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
res = _mm_packus_epi16(res, res1); // 16 8bit values
base_inc =
- _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1),
- (uint8_t)(base + j + 2), (uint8_t)(base + j + 3),
- (uint8_t)(base + j + 4), (uint8_t)(base + j + 5),
- (uint8_t)(base + j + 6), (uint8_t)(base + j + 7),
- (uint8_t)(base + j + 8), (uint8_t)(base + j + 9),
- (uint8_t)(base + j + 10), (uint8_t)(base + j + 11),
- (uint8_t)(base + j + 12), (uint8_t)(base + j + 13),
- (uint8_t)(base + j + 14), (uint8_t)(base + j + 15));
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
_mm_setzero_si128());
@@ -743,7 +743,7 @@ static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
__m128i resx, resy;
__m128i resxy;
int y = r + 1;
- ydx = _mm_set1_epi16((uint16_t)(y * dx));
+ ydx = _mm_set1_epi16((int16_t)(y * dx));
int base_x = (-y * dx) >> frac_bits_x;
for (int j = 0; j < W; j += 16) {
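For reference: the dr_prediction_z1_64xN hunk above narrows the base indices with (int8_t) casts only because _mm_setr_epi8() takes char arguments; the mask logic is unchanged. That mask, an unsigned saturating subtract followed by a signed compare against zero, keeps lanes whose base index is still below max_base_x and zeroes the rest so they fall back to a_mbase_x. A standalone illustration, not part of the patch:

  #include <assert.h>
  #include <emmintrin.h>
  #include <stdint.h>

  int main(void) {
    /* Base indices 60..75 against max_base_x = 64: only lanes 0..3 stay valid. */
    const __m128i max_base = _mm_set1_epi8(64);
    const __m128i base_inc = _mm_setr_epi8(60, 61, 62, 63, 64, 65, 66, 67,
                                           68, 69, 70, 71, 72, 73, 74, 75);
    const __m128i mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
                                        _mm_setzero_si128());
    uint8_t m[16];
    _mm_storeu_si128((__m128i *)m, mask);
    for (int i = 0; i < 16; ++i) assert(m[i] == (i < 4 ? 0xFF : 0x00));
    return 0;
  }
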
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index a75616e1e..fd48260c6 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -47,7 +47,7 @@ void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i t = _mm_loadl_epi64((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -68,7 +68,7 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i t = _mm_loadl_epi64((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -86,10 +86,10 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
__m128i l = _mm_load_si128((const __m128i *)left);
- const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -109,7 +109,7 @@ void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i t = _mm_loadl_epi64((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -130,7 +130,7 @@ void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i t = _mm_loadl_epi64((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -151,7 +151,7 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i t = _mm_loadl_epi64((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -171,7 +171,7 @@ void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i t = _mm_loadl_epi64((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
const __m128i one = _mm_set1_epi16(1);
for (int j = 0; j < 2; ++j) {
@@ -199,12 +199,12 @@ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
const __m128i t = _mm_load_si128((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i top0 = _mm_unpacklo_epi8(t, zero);
const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -225,7 +225,7 @@ void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i zero = _mm_setzero_si128();
const __m128i top0 = _mm_unpacklo_epi8(t, zero);
const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -248,7 +248,7 @@ void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i zero = _mm_setzero_si128();
const __m128i top0 = _mm_unpacklo_epi8(t, zero);
const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
@@ -271,7 +271,7 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i zero = _mm_setzero_si128();
const __m128i top0 = _mm_unpacklo_epi8(t, zero);
const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
__m128i l16;
@@ -305,7 +305,7 @@ void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i zero = _mm_setzero_si128();
const __m128i top0 = _mm_unpacklo_epi8(t, zero);
const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
const __m128i one = _mm_set1_epi16(1);
for (int j = 0; j < 4; ++j) {
@@ -331,7 +331,7 @@ void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i bl = _mm_unpacklo_epi8(b, zero);
const __m128i bh = _mm_unpackhi_epi8(b, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
const __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -360,7 +360,7 @@ void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i bl = _mm_unpacklo_epi8(b, zero);
const __m128i bh = _mm_unpackhi_epi8(b, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
__m128i l = _mm_load_si128((const __m128i *)left);
@@ -390,7 +390,7 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i bl = _mm_unpacklo_epi8(b, zero);
const __m128i bh = _mm_unpackhi_epi8(b, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
__m128i rep = _mm_set1_epi16((short)0x8000);
const __m128i one = _mm_set1_epi16(1);
__m128i l = _mm_load_si128((const __m128i *)left);
@@ -433,7 +433,7 @@ void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i bl = _mm_unpacklo_epi8(b, zero);
const __m128i bh = _mm_unpackhi_epi8(b, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
const __m128i one = _mm_set1_epi16(1);
__m128i l16;
@@ -471,7 +471,7 @@ void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i dl = _mm_unpacklo_epi8(d, zero);
const __m128i dh = _mm_unpackhi_epi8(d, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
const __m128i one = _mm_set1_epi16(1);
__m128i l16;
@@ -513,7 +513,7 @@ void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i dl = _mm_unpacklo_epi8(d, zero);
const __m128i dh = _mm_unpackhi_epi8(d, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
const __m128i one = _mm_set1_epi16(1);
__m128i l16;
@@ -555,7 +555,7 @@ void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const __m128i dl = _mm_unpacklo_epi8(d, zero);
const __m128i dh = _mm_unpackhi_epi8(d, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
const __m128i one = _mm_set1_epi16(1);
__m128i l16;
@@ -586,17 +586,17 @@ void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
int height, __m128i *pixels) {
- __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
if (height == 4)
- pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
else if (height == 8)
pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
else
pixels[1] = _mm_loadu_si128(((const __m128i *)left));
- pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
+ pixels[2] = _mm_set1_epi16((int16_t)above[3]);
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+ const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
const __m128i zero = _mm_setzero_si128();
d = _mm_unpacklo_epi8(d, zero);
pixels[0] = _mm_unpacklo_epi16(d, bp);
@@ -610,8 +610,8 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
static INLINE void load_weight_w4(int height, __m128i *weight_h,
__m128i *weight_w) {
const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
+ const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
weight_h[0] = _mm_unpacklo_epi8(t, zero);
weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
@@ -711,16 +711,16 @@ void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
int height, __m128i *pixels) {
const __m128i zero = _mm_setzero_si128();
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+ const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
__m128i d = _mm_loadl_epi64((const __m128i *)above);
d = _mm_unpacklo_epi8(d, zero);
pixels[0] = _mm_unpacklo_epi16(d, bp);
pixels[1] = _mm_unpackhi_epi16(d, bp);
- pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
+ pixels[3] = _mm_set1_epi16((int16_t)above[7]);
if (height == 4) {
- pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
} else if (height == 8) {
pixels[2] = _mm_loadl_epi64((const __m128i *)left);
} else if (height == 16) {
@@ -750,7 +750,7 @@ static INLINE void load_weight_w8(int height, __m128i *weight_h,
const int we_offset = height < 8 ? 0 : 4;
__m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
weight_h[0] = _mm_unpacklo_epi8(we, zero);
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
if (height == 4) {
@@ -883,62 +883,116 @@ void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}
-static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left, uint32_t bw,
- uint32_t bh) {
- const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
- const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
- const __m128i zero = _mm_setzero_si128();
- const __m128i scale_value =
- _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
- const __m128i dup16 = _mm_set1_epi32(0x01000100);
- const __m128i top_right =
- _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i round =
- _mm_set1_epi32((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
+// (available in 2019+) or greater is specified; __restrict can be used in that
+// case. This should be moved to rtcd and used consistently between the
+// function declarations and definitions to avoid warnings in Visual Studio
+// when defining LIBAOM_RESTRICT to restrict or __restrict.
+#if defined(_MSC_VER)
+#define LIBAOM_RESTRICT
+#else
+#define LIBAOM_RESTRICT restrict
+#endif
+
+static AOM_FORCE_INLINE __m128i Load4(const void *src) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
+ return _mm_loadl_epi64((const __m128i *)(a));
+}
+
+static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
+ return _mm_loadu_si128((const __m128i *)(a));
+}
+
+static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)(a), v);
+}
- for (uint32_t y = 0; y < bh; ++y) {
- const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
- const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
+ _mm_storeu_si128((__m128i *)(a), v);
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
+ return _mm_unpacklo_epi8((x), _mm_setzero_si128());
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
+ const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
+ return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
+ return _mm_unpacklo_epi16((x), _mm_setzero_si128());
+}
+
+void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column, int width,
+ int height) {
+ const uint8_t *const sm_weights_h = smooth_weights + height - 4;
+ const uint8_t *const sm_weights_w = smooth_weights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
+ const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
- __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
- const __m128i wl_y =
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i weight_left_y =
_mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
- pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
- pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
-
- for (uint32_t x = 0; x < bw; x += 8) {
- const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
- const __m128i weights_x =
- _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
- const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
- const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
- const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
-
- __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
- __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
- const __m128i scale_m_weights_x =
- _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
- const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
- const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
- const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
-
- pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
- pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
-
- pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
- pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
-
- pred_lo = _mm_srai_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
- pred_hi = _mm_srai_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
-
- __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
- pred = _mm_shuffle_epi8(pred, gat);
- _mm_storel_epi64((__m128i *)(dst + x), pred);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_row + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+ // Here opposite weights and pixels are multiplied, where the order of
+ // interleaving is indicated in the names.
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+ // |scaled_bottom_left| is always scaled by the same weight each row, so
+ // we only derive |scaled_top_right| values here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+ // The round value for RightShiftWithRounding was added with
+ // |scaled_bottom_left|.
+ pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+ pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
}
dst += stride;
}
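The rewritten smooth_predictor_wxh above keeps the SMOOTH blend, renames the intermediates, and swaps the signed shift for a logical one (the 32-bit sums are non-negative, so the result is identical). In scalar form each output pixel is roughly the following, with scale = 256 (1 << SMOOTH_WEIGHT_LOG2_SCALE) and rounding by half of the final divisor; a sketch only, smooth_pixel is a made-up helper:

  #include <stdint.h>

  uint8_t smooth_pixel(const uint8_t *top_row, const uint8_t *left_column,
                       const uint8_t *sm_weights_w, const uint8_t *sm_weights_h,
                       int width, int height, int x, int y) {
    const int scale = 256;  /* 1 << SMOOTH_WEIGHT_LOG2_SCALE */
    const int w_y = sm_weights_h[y], w_x = sm_weights_w[x];
    const int sum = w_y * top_row[x] + (scale - w_y) * left_column[height - 1] +
                    w_x * left_column[y] + (scale - w_x) * top_row[width - 1];
    /* RightShiftWithRounding by (SMOOTH_WEIGHT_LOG2_SCALE + 1) = 9. */
    return (uint8_t)((sum + scale) >> 9);
  }
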
@@ -968,6 +1022,12 @@ void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
@@ -992,10 +1052,10 @@ void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}
-void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+ smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}
void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
@@ -1004,697 +1064,1934 @@ void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}
-void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+ smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}
-void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+// -----------------------------------------------------------------------------
+// Smooth horizontal/vertical helper functions.
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+static AOM_FORCE_INLINE void write_smooth_directional_sum16(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
+ const __m128i weights1, const __m128i weights2,
+ const __m128i scaled_corner1, const __m128i scaled_corner2,
+ const __m128i round) {
+ const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+ const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+ const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+ const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+ const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+ StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
+}
+
+static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
+ const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
+ const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+ return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+static AOM_FORCE_INLINE void write_smooth_directional_sum8(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
+ const __m128i *scaled_corner, const __m128i *round) {
+ const __m128i pred_sum =
+ smooth_directional_sum8(*pixels, *weights, *scaled_corner);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
+ StoreLo8(dst, _mm_packus_epi16(pred, pred));
}
// -----------------------------------------------------------------------------
// SMOOTH_V_PRED
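SMOOTH_V reuses the same machinery with only the vertical half of the blend: each row mixes the top edge with the bottom-left corner using the row weight, rounding by 128 before the shift by 8 (the write_smooth_directional_sum8/16 helpers above). A scalar sketch, names made up:

  #include <stdint.h>

  uint8_t smooth_v_pixel(const uint8_t *top_row, uint8_t bottom_left,
                         const uint8_t *sm_weights_h, int x, int y) {
    const int scale = 256;  /* 1 << SMOOTH_WEIGHT_LOG2_SCALE */
    const int w_y = sm_weights_h[y];
    const int sum = w_y * top_row[x] + (scale - w_y) * bottom_left;
    return (uint8_t)((sum + 128) >> 8);  /* RightShiftWithRounding(sum, 8) */
  }
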
-// pixels[0]: above and below_pred interleave vector
-static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- const __m128i zero = _mm_setzero_si128();
- __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- d = _mm_unpacklo_epi8(d, zero);
- pixels[0] = _mm_unpacklo_epi16(d, bp);
+static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
+ const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
+ const int height, __m128i *pixels) {
+ __m128i top = Load4(above);
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ top = cvtepu8_epi16(top);
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
}
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vector
-static INLINE void load_weight_v_w4(int height, __m128i *weights) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+// |weight_array| alternates weight vectors from the table with their inverted
+// (256-w) counterparts. This is precomputed by the compiler when the weights
+// table is visible to this module. Removing this visibility can cut speed by up
+// to half in both 4xH and 8xH transforms.
+static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
+ const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
+ __m128i *weights) {
+ const __m128i inverter = _mm_set1_epi16(256);
if (height == 4) {
- const __m128i weight =
- _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
- weights[0] = _mm_unpacklo_epi8(weight, zero);
- weights[1] = _mm_sub_epi16(d, weights[0]);
+ const __m128i weight = Load4(weight_array);
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
} else if (height == 8) {
- const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
- weights[0] = _mm_unpacklo_epi8(weight, zero);
- weights[1] = _mm_sub_epi16(d, weights[0]);
+ const __m128i weight = LoadLo8(weight_array + 4);
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
} else {
- const __m128i weight =
- _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
- weights[0] = _mm_unpacklo_epi8(weight, zero);
- weights[1] = _mm_sub_epi16(d, weights[0]);
+ const __m128i weight = LoadUnaligned16(weight_array + 12);
+ const __m128i zero = _mm_setzero_si128();
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
weights[2] = _mm_unpackhi_epi8(weight, zero);
- weights[3] = _mm_sub_epi16(d, weights[2]);
+ weights[3] = _mm_sub_epi16(inverter, weights[2]);
}
}
-static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
- const __m128i *weight, int h, uint8_t *dst,
- ptrdiff_t stride) {
- const __m128i pred_round =
- _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set1_epi32(0xc080400);
- __m128i d = _mm_set1_epi16(0x100);
-
- for (int i = 0; i < h; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
- __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
+static AOM_FORCE_INLINE void write_smooth_vertical4xh(
+ const __m128i *pixel, const __m128i *weight, const int height,
+ uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32(128);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int y = 0; y < height; ++y) {
+ const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+ const __m128i alternate_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+ // The madd instruction yields four results of the form:
+ // (top_row[x] * weight[y] + corner * inverted_weight[y])
+ __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
sum = _mm_add_epi32(sum, pred_round);
- sum = _mm_srai_epi32(sum, SMOOTH_WEIGHT_LOG2_SCALE);
- sum = _mm_shuffle_epi8(sum, gat);
- *(int *)dst = _mm_cvtsi128_si32(sum);
+ sum = _mm_srai_epi32(sum, 8);
+ sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+ Store4(dst, sum);
dst += stride;
- d = _mm_add_epi16(d, inc);
+ y_select = _mm_add_epi16(y_select, mask_increment);
}
}
-void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
+void aom_smooth_v_predictor_4x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
__m128i pixels;
- load_pixel_v_w4(above, left, 4, &pixels);
+ load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
__m128i weights[2];
- load_weight_v_w4(4, weights);
+ load_smooth_vertical_weights4(smooth_weights, 4, weights);
- smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
+ write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
}
-void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
+void aom_smooth_v_predictor_4x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
__m128i pixels;
- load_pixel_v_w4(above, left, 8, &pixels);
+ load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
__m128i weights[2];
- load_weight_v_w4(8, weights);
+ load_smooth_vertical_weights4(smooth_weights, 8, weights);
- smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+ write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
}
-void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
+void aom_smooth_v_predictor_4x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
__m128i pixels;
- load_pixel_v_w4(above, left, 16, &pixels);
+ load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
__m128i weights[4];
- load_weight_v_w4(16, weights);
+ load_smooth_vertical_weights4(smooth_weights, 16, weights);
- smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+ write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
dst += stride << 3;
- smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
+ write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
+ const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+}
+
+void aom_smooth_v_predictor_8x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
}
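The per-row loops in the new SMOOTH_V code rely on a pshufb broadcast: y_mask runs from 0x01000100 to 0x0F0E0F0E in steps of 0x02020202, so its repeated byte pair selects source bytes 2y and 2y+1, splatting the y-th 16-bit weight (and the matching precomputed (256 - w_y) * bottom_left term) to every lane. A standalone illustration, not part of the patch:

  #include <assert.h>
  #include <stdint.h>
  #include <tmmintrin.h>

  int main(void) {
    /* Eight 16-bit "weights" 100..107 in lanes 0..7. */
    const __m128i weights =
        _mm_setr_epi16(100, 101, 102, 103, 104, 105, 106, 107);
    /* y = 2: control bytes 4,5 repeated -> broadcast lane 2 everywhere. */
    const __m128i y_select = _mm_set1_epi32(0x05040504);
    const __m128i row_w = _mm_shuffle_epi8(weights, y_select);
    int16_t out[8];
    _mm_storeu_si128((__m128i *)out, row_w);
    for (int i = 0; i < 8; ++i) assert(out[i] == 102);
    return 0;
  }
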
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- const __m128i zero = _mm_setzero_si128();
- __m128i d = _mm_loadl_epi64((const __m128i *)above);
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- d = _mm_unpacklo_epi8(d, zero);
- pixels[0] = _mm_unpacklo_epi16(d, bp);
- pixels[1] = _mm_unpackhi_epi16(d, bp);
+void aom_smooth_v_predictor_8x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
}
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-static INLINE void load_weight_v_w8(int height, __m128i *weight_h) {
+void aom_smooth_v_predictor_8x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
-
- if (height < 16) {
- const int offset = height < 8 ? 0 : 4;
- const __m128i weight =
- _mm_loadu_si128((const __m128i *)&smooth_weights[offset]);
- weight_h[0] = _mm_unpacklo_epi8(weight, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- } else if (height == 16) {
- const __m128i weight =
- _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
- weight_h[0] = _mm_unpacklo_epi8(weight, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(weight, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
- } else {
- const __m128i weight_lo =
- _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
- weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
- const __m128i weight_hi =
- _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
- weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
- weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
- weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
- weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
}
-}
-
-static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
- int h, uint8_t *dst, ptrdiff_t stride) {
- const __m128i pred_round =
- _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- __m128i d = _mm_set1_epi16(0x100);
-
- for (int i = 0; i < h; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
- __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
- __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
- s0 = _mm_add_epi32(s0, pred_round);
- s0 = _mm_srai_epi32(s0, SMOOTH_WEIGHT_LOG2_SCALE);
-
- s1 = _mm_add_epi32(s1, pred_round);
- s1 = _mm_srai_epi32(s1, SMOOTH_WEIGHT_LOG2_SCALE);
-
- __m128i sum01 = _mm_packus_epi16(s0, s1);
- sum01 = _mm_shuffle_epi8(sum01, gat);
- _mm_storel_epi64((__m128i *)dst, sum01);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
dst += stride;
-
- d = _mm_add_epi16(d, inc);
}
}
-void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 4, pixels);
-
- __m128i wh[2];
- load_weight_v_w8(4, wh);
-
- smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 8, pixels);
-
- __m128i wh[2];
- load_weight_v_w8(8, wh);
-
- smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 16, pixels);
-
- __m128i wh[4];
- load_weight_v_w8(16, wh);
-
- smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 32, pixels);
-
- __m128i wh[8];
- load_weight_v_w8(32, wh);
-
- smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
+void aom_smooth_v_predictor_16x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
+ const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
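+ // 128 == 1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1), the rounding bias added
+ // before the final shift by SMOOTH_WEIGHT_LOG2_SCALE (8).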
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+}
+
+void aom_smooth_v_predictor_16x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
}
-static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left, uint32_t bw,
- uint32_t bh) {
- const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
+void aom_smooth_v_predictor_16x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
const __m128i zero = _mm_setzero_si128();
- const __m128i scale_value =
- _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i dup16 = _mm_set1_epi32(0x01000100);
- const __m128i bottom_left =
- _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i round =
- _mm_set1_epi32((uint16_t)(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-
- for (uint32_t y = 0; y < bh; ++y) {
- const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
- const __m128i scale_m_weights_y =
- _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
- const __m128i wl_y =
- _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
-
- for (uint32_t x = 0; x < bw; x += 8) {
- const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
- // 8 -> 16
- const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
- const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
- const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
- // top_x * weights_y + scale_m_weights_y * bottom_left
- __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
- __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
- pred_lo = _mm_add_epi32(pred_lo, round);
- pred_hi = _mm_add_epi32(pred_hi, round);
- pred_lo = _mm_srai_epi32(pred_lo, SMOOTH_WEIGHT_LOG2_SCALE);
- pred_hi = _mm_srai_epi32(pred_hi, SMOOTH_WEIGHT_LOG2_SCALE);
-
- __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
- pred = _mm_shuffle_epi8(pred, gat);
- _mm_storel_epi64((__m128i *)(dst + x), pred);
- }
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
dst += stride;
}
}
-void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_H_PRED
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- if (height == 4)
- pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
- else if (height == 8)
- pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
- else
- pixels[0] = _mm_loadu_si128(((const __m128i *)left));
- pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+void aom_smooth_v_predictor_16x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
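+ // The 32 rows are produced in four groups of 8, one group per weight
+ // sub-vector (weights1..weights4).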
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
}
-// weights[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_h_w4(int height, __m128i *weights) {
- (void)height;
- const __m128i t = _mm_loadu_si128((const __m128i *)&smooth_weights[0]);
+void aom_smooth_v_predictor_16x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(128);
const __m128i zero = _mm_setzero_si128();
-
- const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
- weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
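+ // smooth_weights packs the weight arrays for all block dimensions back to
+ // back; the entries for dimension d start at offset d - 4 (here 64 - 4 == 60).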
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
}
-static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
- const __m128i *weight, int h, uint8_t *dst,
- ptrdiff_t stride) {
- const __m128i pred_round =
- _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const __m128i one = _mm_set1_epi16(1);
- const __m128i gat = _mm_set1_epi32(0xc080400);
- __m128i rep = _mm_set1_epi16((short)0x8000);
-
- for (int i = 0; i < h; ++i) {
- __m128i b = _mm_shuffle_epi8(pixel[0], rep);
- b = _mm_unpacklo_epi16(b, pixel[1]);
- __m128i sum = _mm_madd_epi16(b, weight[0]);
-
- sum = _mm_add_epi32(sum, pred_round);
- sum = _mm_srai_epi32(sum, SMOOTH_WEIGHT_LOG2_SCALE);
-
- sum = _mm_shuffle_epi8(sum, gat);
- *(int *)dst = _mm_cvtsi128_si32(sum);
+void aom_smooth_v_predictor_32x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
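+ // Each 32-pixel row is written as two 16-pixel halves: dst and dst + 16.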
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
dst += stride;
-
- rep = _mm_add_epi16(rep, one);
}
}
-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w4(above, left, 4, pixels);
-
- __m128i weights;
- load_weight_h_w4(4, &weights);
-
- smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w4(above, left, 8, pixels);
-
- __m128i weights;
- load_weight_h_w4(8, &weights);
-
- smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+void aom_smooth_v_predictor_32x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w4(above, left, 16, pixels);
-
- __m128i weights;
- load_weight_h_w4(8, &weights);
-
- smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
- dst += stride << 3;
-
- pixels[0] = _mm_srli_si128(pixels[0], 8);
- smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+void aom_smooth_v_predictor_32x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
}
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-// pixels[2]: left vector + 16
-// pixels[3]: right_pred vector
-static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
-
- if (height == 4) {
- pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
- } else if (height == 8) {
- pixels[0] = _mm_loadl_epi64((const __m128i *)left);
- } else if (height == 16) {
- pixels[0] = _mm_load_si128((const __m128i *)left);
- } else {
- pixels[0] = _mm_load_si128((const __m128i *)left);
- pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
- pixels[3] = pixels[1];
+void aom_smooth_v_predictor_32x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
}
}
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_h_w8(int height, __m128i *weight_w) {
- (void)height;
+void aom_smooth_v_predictor_64x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[4]);
- const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
- const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
- weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
- weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-}
-
-static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
- int h, uint8_t *dst, ptrdiff_t stride,
- int second_half) {
- const __m128i pred_round =
- _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const __m128i one = _mm_set1_epi16(1);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
- : _mm_set1_epi16((short)0x8000);
-
- for (int i = 0; i < h; ++i) {
- __m128i b = _mm_shuffle_epi8(pixels[0], rep);
- b = _mm_unpacklo_epi16(b, pixels[1]);
- __m128i sum0 = _mm_madd_epi16(b, ww[0]);
- __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
- sum0 = _mm_add_epi32(sum0, pred_round);
- sum0 = _mm_srai_epi32(sum0, SMOOTH_WEIGHT_LOG2_SCALE);
-
- sum1 = _mm_add_epi32(sum1, pred_round);
- sum1 = _mm_srai_epi32(sum1, SMOOTH_WEIGHT_LOG2_SCALE);
-
- sum0 = _mm_packus_epi16(sum0, sum1);
- sum0 = _mm_shuffle_epi8(sum0, gat);
- _mm_storel_epi64((__m128i *)dst, sum0);
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
dst += stride;
-
- rep = _mm_add_epi16(rep, one);
}
}
-void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w8(above, left, 4, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(4, ww);
-
- smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w8(above, left, 8, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(8, ww);
-
- smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+void aom_smooth_v_predictor_64x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w8(above, left, 16, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(16, ww);
-
- smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
+void aom_smooth_v_predictor_64x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
}
-void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[4];
- load_pixel_h_w8(above, left, 32, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(32, ww);
-
- smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
- dst += stride << 3;
- smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
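+// SMOOTH_H computes, per pixel,
+//   pred[y][x] = (w[x] * left[y] + (256 - w[x]) * top[bw - 1] + 128) >> 8
+// with w = smooth_weights + bw - 4. The code below keeps w and the
+// precomputed (256 - w[x]) * top_right terms in registers and varies only
+// the broadcast left[y] per row.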
+static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
+ const __m128i *scaled_top_right, const __m128i *round) {
+ const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
+ const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
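+ // 0x0C080400 gathers the low byte of each 32-bit lane into the first four
+ // bytes for the 4-pixel store below.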
+ Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+void aom_smooth_h_predictor_4x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i left = cvtepu8_epi32(Load4(left_column));
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_4x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi32(Load4(left_column));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 4));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi32(Load4(left_column));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 4));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 8));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 12));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+// In the write_smooth_directional_sum* helpers shared by SMOOTH_H and
+// SMOOTH_V: for SMOOTH_H the |pixels| argument is the left value repeated
+// across the row, while for SMOOTH_V it is a segment of the top row (or the
+// whole top row) and the |weights| argument is the per-row weight repeated.
+void aom_smooth_h_predictor_8x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i left = cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_8x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
}
-static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left, uint32_t bw,
- uint32_t bh) {
- const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
- const __m128i zero = _mm_setzero_si128();
- const __m128i scale_value =
- _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i pred_round =
- _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-
- for (uint32_t y = 0; y < bh; ++y) {
- const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
- const __m128i tr_ly =
- _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
-
- for (uint32_t x = 0; x < bw; x += 8) {
- const __m128i weights_x =
- _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
- const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
- const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
- const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
- const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
- __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
- __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
-
- pred_lo = _mm_add_epi32(pred_lo, pred_round);
- pred_hi = _mm_add_epi32(pred_hi, pred_round);
-
- pred_lo = _mm_srai_epi32(pred_lo, SMOOTH_WEIGHT_LOG2_SCALE);
- pred_hi = _mm_srai_epi32(pred_hi, SMOOTH_WEIGHT_LOG2_SCALE);
-
- __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
- pred = _mm_shuffle_epi8(pred, gat);
- _mm_storel_epi64((__m128i *)(dst + x), pred);
- }
+void aom_smooth_h_predictor_8x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
dst += stride;
}
}
-void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
+void aom_smooth_h_predictor_8x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
+void aom_smooth_h_predictor_16x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i left = cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_mask = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+}
+
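// Editor's note, not part of the patch: a minimal sketch of the shuffle
// trick used throughout these kernels. Each 32-bit lane of the control
// repeats the byte indices 2*y and 2*y+1, so _mm_shuffle_epi8 replicates
// the y-th 16-bit element of `left` across the register: 0x01000100 picks
// row 0, 0x03020302 picks row 1, and adding 0x02020202 advances one row,
// which is why the loops step the mask by that amount through rows 0..7.
// Assumes only SSSE3 (<tmmintrin.h>); broadcast_lane_epi16 is a
// hypothetical name used for illustration.
static inline __m128i broadcast_lane_epi16(__m128i v, int y) {
  const int lo = 2 * y, hi = 2 * y + 1;
  const __m128i sel = _mm_set1_epi32((hi << 24) | (lo << 16) | (hi << 8) | lo);
  return _mm_shuffle_epi8(v, sel);
}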
+void aom_smooth_h_predictor_16x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
+void aom_smooth_h_predictor_16x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
+void aom_smooth_h_predictor_16x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
+void aom_smooth_h_predictor_16x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ dst += stride;
+ }
+ }
}
-void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
+void aom_smooth_h_predictor_32x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
+void aom_smooth_h_predictor_32x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ const __m128i left2 =
+ cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
+void aom_smooth_h_predictor_32x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
+void aom_smooth_h_predictor_32x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
+ weights4, scaled_top_right3,
+ scaled_top_right4, round);
+ dst += stride;
+ }
+ }
}
-void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
+void aom_smooth_h_predictor_64x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
+void aom_smooth_h_predictor_64x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
}
-void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
+void aom_smooth_h_predictor_64x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
+ weights4, scaled_top_right3,
+ scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
+ weights6, scaled_top_right5,
+ scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
+ weights8, scaled_top_right7,
+ scaled_top_right8, round);
+ dst += stride;
+ }
+ }
}
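For reference, every aom_smooth_h_predictor_WxH_ssse3 variant above reduces to the same scalar arithmetic; the following is an editorial sketch, not part of the patch, assuming SMOOTH_WEIGHT_LOG2_SCALE == 8 so the weight scale is 256, with `w` standing for the smooth_weights[] slice the SIMD code loads for the block width:

#include <stddef.h>
#include <stdint.h>

// pred[y][x] = (w[x] * left[y] + (256 - w[x]) * top_right + 128) >> 8
static void smooth_h_scalar(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *top_row,
                            const uint8_t *left_column, const uint8_t *w,
                            int width, int height) {
  const int top_right = top_row[width - 1];
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst[x] = (uint8_t)((w[x] * left_column[y] +
                          (256 - w[x]) * top_right + 128) >> 8);
    }
    dst += stride;
  }
}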
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index 6ec5dd8c1..dd798ca54 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -49,13 +49,12 @@ void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
int ref_stride,
const DIST_WTD_COMP_PARAMS *jcp_param) {
int i;
- const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
- const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+ const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+ const int8_t w1 = (int8_t)jcp_param->bck_offset;
const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
w1, w0, w1, w0);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
if (width >= 16) {
// Read 16 pixels one row at a time
@@ -95,10 +94,10 @@ void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
assert(!(width & 3));
assert(!(height & 3));
for (i = 0; i < height; i += 4) {
- const uint8_t *row0 = ref + 0 * ref_stride;
- const uint8_t *row1 = ref + 1 * ref_stride;
- const uint8_t *row2 = ref + 2 * ref_stride;
- const uint8_t *row3 = ref + 3 * ref_stride;
+ const int8_t *row0 = (const int8_t *)ref + 0 * ref_stride;
+ const int8_t *row1 = (const int8_t *)ref + 1 * ref_stride;
+ const int8_t *row2 = (const int8_t *)ref + 2 * ref_stride;
+ const int8_t *row3 = (const int8_t *)ref + 3 * ref_stride;
__m128i p0 =
_mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
diff --git a/aom_dsp/x86/loopfilter_avx2.c b/aom_dsp/x86/loopfilter_avx2.c
index b59381928..6e77742e3 100644
--- a/aom_dsp/x86/loopfilter_avx2.c
+++ b/aom_dsp/x86/loopfilter_avx2.c
@@ -32,7 +32,7 @@ void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p,
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
const __m128i blimit_v =
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
p256_2 =
@@ -239,7 +239,7 @@ void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
const __m128i blimit_v =
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
p256_3 =
@@ -472,51 +472,162 @@ void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
}
}
+static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p,
+ unsigned char *out, int out_p,
+ int is_store_avx2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)in0);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)(in0 + in_p * 1));
+ const __m128i x2 = _mm_loadu_si128((__m128i *)(in0 + in_p * 2));
+ const __m128i x3 = _mm_loadu_si128((__m128i *)(in0 + in_p * 3));
+ const __m128i x4 = _mm_loadu_si128((__m128i *)(in0 + in_p * 4));
+ const __m128i x5 = _mm_loadu_si128((__m128i *)(in0 + in_p * 5));
+ const __m128i x6 = _mm_loadu_si128((__m128i *)(in0 + in_p * 6));
+ const __m128i x7 = _mm_loadu_si128((__m128i *)(in0 + in_p * 7));
+
+ const __m256i y0 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x0), _mm_loadu_si128((__m128i *)(in0 + in_p * 8)),
+ 0x1);
+ const __m256i y1 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x1), _mm_loadu_si128((__m128i *)(in0 + in_p * 9)),
+ 0x1);
+ const __m256i y2 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x2), _mm_loadu_si128((__m128i *)(in0 + in_p * 10)),
+ 0x1);
+ const __m256i y3 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x3), _mm_loadu_si128((__m128i *)(in0 + in_p * 11)),
+ 0x1);
+ const __m256i y4 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x4), _mm_loadu_si128((__m128i *)(in0 + in_p * 12)),
+ 0x1);
+ const __m256i y5 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x5), _mm_loadu_si128((__m128i *)(in0 + in_p * 13)),
+ 0x1);
+ const __m256i y6 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x6), _mm_loadu_si128((__m128i *)(in0 + in_p * 14)),
+ 0x1);
+ const __m256i y7 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x7), _mm_loadu_si128((__m128i *)(in0 + in_p * 15)),
+ 0x1);
+
+ const __m256i y_s00 = _mm256_unpacklo_epi8(y0, y1);
+ const __m256i y_s01 = _mm256_unpackhi_epi8(y0, y1);
+ const __m256i y_s02 = _mm256_unpacklo_epi8(y2, y3);
+ const __m256i y_s03 = _mm256_unpackhi_epi8(y2, y3);
+ const __m256i y_s04 = _mm256_unpacklo_epi8(y4, y5);
+ const __m256i y_s05 = _mm256_unpackhi_epi8(y4, y5);
+ const __m256i y_s06 = _mm256_unpacklo_epi8(y6, y7);
+ const __m256i y_s07 = _mm256_unpackhi_epi8(y6, y7);
+
+ const __m256i y_s10 = _mm256_unpacklo_epi16(y_s00, y_s02);
+ const __m256i y_s11 = _mm256_unpackhi_epi16(y_s00, y_s02);
+ const __m256i y_s12 = _mm256_unpacklo_epi16(y_s01, y_s03);
+ const __m256i y_s13 = _mm256_unpackhi_epi16(y_s01, y_s03);
+ const __m256i y_s14 = _mm256_unpacklo_epi16(y_s04, y_s06);
+ const __m256i y_s15 = _mm256_unpackhi_epi16(y_s04, y_s06);
+ const __m256i y_s16 = _mm256_unpacklo_epi16(y_s05, y_s07);
+ const __m256i y_s17 = _mm256_unpackhi_epi16(y_s05, y_s07);
+
+ const __m256i y_s20 = _mm256_unpacklo_epi32(y_s10, y_s14);
+ const __m256i y_s21 = _mm256_unpackhi_epi32(y_s10, y_s14);
+ const __m256i y_s22 = _mm256_unpacklo_epi32(y_s11, y_s15);
+ const __m256i y_s23 = _mm256_unpackhi_epi32(y_s11, y_s15);
+ const __m256i y_s24 = _mm256_unpacklo_epi32(y_s12, y_s16);
+ const __m256i y_s25 = _mm256_unpackhi_epi32(y_s12, y_s16);
+ const __m256i y_s26 = _mm256_unpacklo_epi32(y_s13, y_s17);
+ const __m256i y_s27 = _mm256_unpackhi_epi32(y_s13, y_s17);
+
+ const __m256i row_s01 = _mm256_permute4x64_epi64(y_s20, 0xd8);
+ const __m256i row_s23 = _mm256_permute4x64_epi64(y_s21, 0xd8);
+ const __m256i row_s45 = _mm256_permute4x64_epi64(y_s22, 0xd8);
+ const __m256i row_s67 = _mm256_permute4x64_epi64(y_s23, 0xd8);
+ const __m256i row_s89 = _mm256_permute4x64_epi64(y_s24, 0xd8);
+ const __m256i row_s1011 = _mm256_permute4x64_epi64(y_s25, 0xd8);
+ const __m256i row_s1213 = _mm256_permute4x64_epi64(y_s26, 0xd8);
+ const __m256i row_s1415 = _mm256_permute4x64_epi64(y_s27, 0xd8);
+
+ if (is_store_avx2) {
+ _mm256_storeu_si256((__m256i *)(out), row_s01);
+ _mm256_storeu_si256((__m256i *)(out + (2 * out_p)), row_s23);
+ _mm256_storeu_si256((__m256i *)(out + (4 * out_p)), row_s45);
+ _mm256_storeu_si256((__m256i *)(out + (6 * out_p)), row_s67);
+ _mm256_storeu_si256((__m256i *)(out + (8 * out_p)), row_s89);
+ _mm256_storeu_si256((__m256i *)(out + (10 * out_p)), row_s1011);
+ _mm256_storeu_si256((__m256i *)(out + (12 * out_p)), row_s1213);
+ _mm256_storeu_si256((__m256i *)(out + (14 * out_p)), row_s1415);
+ } else {
+ _mm_storeu_si128((__m128i *)(out), _mm256_castsi256_si128(row_s01));
+ _mm_storeu_si128((__m128i *)(out + (2 * out_p)),
+ _mm256_castsi256_si128(row_s23));
+ _mm_storeu_si128((__m128i *)(out + (4 * out_p)),
+ _mm256_castsi256_si128(row_s45));
+ _mm_storeu_si128((__m128i *)(out + (6 * out_p)),
+ _mm256_castsi256_si128(row_s67));
+ _mm_storeu_si128((__m128i *)(out + (8 * out_p)),
+ _mm256_castsi256_si128(row_s89));
+ _mm_storeu_si128((__m128i *)(out + (10 * out_p)),
+ _mm256_castsi256_si128(row_s1011));
+ _mm_storeu_si128((__m128i *)(out + (12 * out_p)),
+ _mm256_castsi256_si128(row_s1213));
+ _mm_storeu_si128((__m128i *)(out + (14 * out_p)),
+ _mm256_castsi256_si128(row_s1415));
+ _mm_storeu_si128((__m128i *)(out + (1 * out_p)),
+ _mm256_extracti128_si256(row_s01, 1));
+ _mm_storeu_si128((__m128i *)(out + (3 * out_p)),
+ _mm256_extracti128_si256(row_s23, 1));
+ _mm_storeu_si128((__m128i *)(out + (5 * out_p)),
+ _mm256_extracti128_si256(row_s45, 1));
+ _mm_storeu_si128((__m128i *)(out + (7 * out_p)),
+ _mm256_extracti128_si256(row_s67, 1));
+ _mm_storeu_si128((__m128i *)(out + (9 * out_p)),
+ _mm256_extracti128_si256(row_s89, 1));
+ _mm_storeu_si128((__m128i *)(out + (11 * out_p)),
+ _mm256_extracti128_si256(row_s1011, 1));
+ _mm_storeu_si128((__m128i *)(out + (13 * out_p)),
+ _mm256_extracti128_si256(row_s1213, 1));
+ _mm_storeu_si128((__m128i *)(out + (15 * out_p)),
+ _mm256_extracti128_si256(row_s1415, 1));
+ }
+}
+
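// Editor's note, not part of the patch: the scalar effect of the helper
// above is, as its name suggests, a 16x16 byte transpose from in0/in_p to
// out/out_p. The unpack/permute stages build each pair of transposed rows
// in one 256-bit register; is_store_avx2 then picks either one 32-byte
// store per pair (which presumes consecutive output rows are 16 bytes
// apart) or two separate 16-byte stores.
static inline void transpose_16x16_c(const unsigned char *in0, int in_p,
                                     unsigned char *out, int out_p) {
  for (int r = 0; r < 16; ++r) {
    for (int c = 0; c < 16; ++c) out[c * out_p + r] = in0[r * in_p + c];
  }
}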
void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
- __m128i p3, p2, p1, p0, q0, q1, q2, q3;
- __m128i mask, flat, flat2;
-
- const __m128i thresh_v =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
- const __m128i limit_v =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
- const __m128i blimit_v =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
- const __m128i zero = _mm_set1_epi16(0);
+ __m128i mask, flat;
+ const __m128i zero = _mm_setzero_si128();
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
- p256_3 =
+ __m256i p256_3 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
- p256_2 =
+ __m256i p256_2 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
- p256_1 =
+ __m256i p256_1 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
- p256_0 =
+ __m256i p256_0 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
- q256_0 =
+ __m256i q256_0 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
- q256_1 =
+ __m256i q256_1 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
- q256_2 =
+ __m256i q256_2 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
- q256_3 =
+ __m256i q256_3 =
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
- p3 = _mm256_castsi256_si128(p256_3);
- p2 = _mm256_castsi256_si128(p256_2);
- p1 = _mm256_castsi256_si128(p256_1);
- p0 = _mm256_castsi256_si128(p256_0);
- q0 = _mm256_castsi256_si128(q256_0);
- q1 = _mm256_castsi256_si128(q256_1);
- q2 = _mm256_castsi256_si128(q256_2);
- q3 = _mm256_castsi256_si128(q256_3);
+ __m128i p3 = _mm256_castsi256_si128(p256_3);
+ __m128i p2 = _mm256_castsi256_si128(p256_2);
+ __m128i p1 = _mm256_castsi256_si128(p256_1);
+ __m128i p0 = _mm256_castsi256_si128(p256_0);
+ __m128i q0 = _mm256_castsi256_si128(q256_0);
+ __m128i q1 = _mm256_castsi256_si128(q256_1);
+ __m128i q2 = _mm256_castsi256_si128(q256_2);
+ __m128i q3 = _mm256_castsi256_si128(q256_3);
{
- __m128i work;
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
const __m128i abs_p1p0 =
_mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
@@ -537,7 +648,7 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
mask = _mm_max_epu8(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(
+ __m128i work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
_mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
mask = _mm_max_epu8(work, mask);
@@ -553,39 +664,33 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
// loop filter
{
- const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i one = _mm_set1_epi8(1);
const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t4 = _mm_add_epi8(one, t3);
const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
- const __m128i one = _mm_set1_epi8(1);
- __m128i hev;
+ const __m128i t7f = _mm_sub_epi8(t80, one);
- hev = _mm_subs_epu8(flat, thresh_v);
+ __m128i hev = _mm_subs_epu8(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
__m128i ps1 = _mm_xor_si128(p1, t80);
__m128i ps0 = _mm_xor_si128(p0, t80);
__m128i qs0 = _mm_xor_si128(q0, t80);
__m128i qs1 = _mm_xor_si128(q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, flat2_p0,
- flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, flat_p2,
- flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
+ __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ __m128i work_a = _mm_subs_epi8(qs0, ps0);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_and_si128(filt, mask);
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
+ __m128i filter1 = _mm_adds_epi8(filt, t4);
+ __m128i filter2 = _mm_adds_epi8(filt, t3);
work_a = _mm_cmpgt_epi8(zero, filter1);
filter1 = _mm_srli_epi16(filter1, 3);
@@ -601,7 +706,7 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
filter2 = _mm_or_si128(filter2, work_a);
ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- filt = _mm_adds_epi8(filter1, t1);
+ filt = _mm_adds_epi8(filter1, one);
work_a = _mm_cmpgt_epi8(zero, filt);
filt = _mm_srli_epi16(filt, 1);
work_a = _mm_and_si128(work_a, t80);
@@ -611,15 +716,23 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- __m128i work;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
+ // Derive flat
+ __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0);
+ __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0);
+ __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0);
+ const __m256i ps0qs0256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1);
+ const __m256i ps1qs1256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1);
+ const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p2q2256));
+ const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p3q3256));
+ const __m256i max0_256 = _mm256_max_epu8(work01, work02);
+ const __m128i max1_256 =
+ _mm_max_epu8(_mm256_castsi256_si128(max0_256),
+ _mm256_extractf128_si256(max0_256, 1));
+ flat = _mm_max_epu8(max1_256, flat);
flat = _mm_subs_epu8(flat, one);
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
@@ -627,47 +740,38 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// flat and wide flat calculations
if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i flat256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1);
const __m256i eight = _mm256_set1_epi16(8);
const __m256i four = _mm256_set1_epi16(4);
- __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, p2p1p0, q2q1q0,
- pixetFilter_q2q1q0, sum_p, sum_q, res_p, res_q;
- __m256i p256_4, q256_4, p256_5, q256_5, p256_6, q256_6;
- __m128i p4, q4, p5, q5, p6, q6;
- p256_4 = _mm256_castpd_si256(
+ __m256i p256_4 = _mm256_castpd_si256(
_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
- q256_4 = _mm256_castpd_si256(
+ __m256i q256_4 = _mm256_castpd_si256(
_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
- p4 = _mm256_castsi256_si128(p256_4);
- q4 = _mm256_castsi256_si128(q256_4);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
-
- p256_5 = _mm256_castpd_si256(
+ __m256i p256_5 = _mm256_castpd_si256(
_mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
- q256_5 = _mm256_castpd_si256(
+ __m256i q256_5 = _mm256_castpd_si256(
_mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
- p5 = _mm256_castsi256_si128(p256_5);
- q5 = _mm256_castsi256_si128(q256_5);
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
- _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
-
- flat2 = _mm_max_epu8(work, flat2);
-
- p256_6 = _mm256_castpd_si256(
+ __m256i p256_6 = _mm256_castpd_si256(
_mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
- q256_6 = _mm256_castpd_si256(
+ __m256i q256_6 = _mm256_castpd_si256(
_mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
- p6 = _mm256_castsi256_si128(p256_6);
- q6 = _mm256_castsi256_si128(q256_6);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
- _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
-
- flat2 = _mm_max_epu8(work, flat2);
+ // Derive flat2
+ __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0);
+ __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0);
+ const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0);
+ const __m256i work1 = _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p4q4256));
+ const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p5q5256));
+ const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p6q6256));
+ __m256i flat2_256 = _mm256_max_epu8(work1, work2);
+ flat2_256 = _mm256_max_epu8(flat2_256, work3);
+ __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256),
+ _mm256_extractf128_si256(flat2_256, 1));
flat2 = _mm_subs_epu8(flat2, one);
flat2 = _mm_cmpeq_epi8(flat2, zero);
flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
@@ -684,72 +788,66 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
q256_2 = _mm256_shuffle_epi8(q256_2, filter);
q256_3 = _mm256_shuffle_epi8(q256_3, filter);
- p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
- q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+ const __m256i p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ const __m256i q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
- pixetFilter_p2p1p0 =
+ __m256i pixetFilter_p2p1p0 =
_mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
- pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+ __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+ // Derive p0 and q0
pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
- res_p =
+ __m256i res_p =
_mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
- flat_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p0 = _mm_andnot_si128(flat, ps0);
- flat_p0 = _mm_and_si128(flat, flat_p0);
- p0 = _mm_or_si128(flat_p0, p0);
-
pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
- res_q =
+ __m256i res_q =
_mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
- flat_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q0 = _mm_andnot_si128(flat, qs0);
- flat_q0 = _mm_and_si128(flat, flat_q0);
- q0 = _mm_or_si128(flat_q0, q0);
-
- sum_p = _mm256_sub_epi16(p256_3, q256_2);
+ __m256i flat_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256);
+ flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0);
+ p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256);
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+
+ // Derive p1 and q1
+ __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2);
pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
res_p =
_mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
- flat_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p1 = _mm_andnot_si128(flat, ps1);
- flat_p1 = _mm_and_si128(flat, flat_p1);
- p1 = _mm_or_si128(flat_p1, p1);
-
- sum_q = _mm256_sub_epi16(q256_3, p256_2);
+ __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2);
pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
res_q =
_mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
- flat_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q1 = _mm_andnot_si128(flat, qs1);
- flat_q1 = _mm_and_si128(flat, flat_q1);
- q1 = _mm_or_si128(flat_q1, q1);
-
+ __m256i flat_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256);
+ flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1);
+ p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+
+ // Derive p2 and q2
sum_p = _mm256_sub_epi16(p256_3, q256_1);
pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
res_p =
_mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
- flat_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p2 = _mm_andnot_si128(flat, p2);
- flat_p2 = _mm_and_si128(flat, flat_p2);
- p2 = _mm_or_si128(flat_p2, p2);
-
sum_q = _mm256_sub_epi16(q256_3, p256_1);
pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
res_q =
_mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
- flat_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q2 = _mm_andnot_si128(flat, q2);
- flat_q2 = _mm_and_si128(flat, flat_q2);
- q2 = _mm_or_si128(flat_q2, q2);
-
+ __m256i flat_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat256, p2q2256);
+ flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2);
+ p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1);
p256_6 = _mm256_shuffle_epi8(p256_6, filter);
p256_5 = _mm256_shuffle_epi8(p256_5, filter);
p256_4 = _mm256_shuffle_epi8(p256_4, filter);
@@ -757,9 +855,9 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
q256_5 = _mm256_shuffle_epi8(q256_5, filter);
q256_6 = _mm256_shuffle_epi8(q256_6, filter);
- pixelFilter_p =
+ __m256i pixelFilter_p =
_mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3));
- pixelFilter_q =
+ __m256i pixelFilter_q =
_mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3));
pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0);
@@ -771,135 +869,118 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
pixelFilter_q = pixelFilter_p;
+ // Derive p0 and q0
pixelFilter_p =
_mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p);
res_p = _mm256_srli_epi16(pixelFilter_p, 4);
- flat2_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p0 = _mm_andnot_si128(flat2, p0);
- flat2_p0 = _mm_and_si128(flat2, flat2_p0);
- p0 = _mm_or_si128(flat2_p0, p0);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-
pixelFilter_q =
_mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q);
res_q = _mm256_srli_epi16(pixelFilter_q, 4);
- flat2_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q0 = _mm_andnot_si128(flat2, q0);
- flat2_q0 = _mm_and_si128(flat2, flat2_q0);
- q0 = _mm_or_si128(flat2_q0, q0);
+ __m256i flat2_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256);
+ flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0);
+ p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256);
+
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
_mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ // Derive p1 and q1
sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5),
_mm256_sub_epi16(p256_2, q256_0));
pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
res_p = _mm256_srli_epi16(pixelFilter_p, 4);
- flat2_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p1 = _mm_andnot_si128(flat2, p1);
- flat2_p1 = _mm_and_si128(flat2, flat2_p1);
- p1 = _mm_or_si128(flat2_p1, p1);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-
sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5),
_mm256_sub_epi16(q256_2, p256_0));
pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
res_q = _mm256_srli_epi16(pixelFilter_q, 4);
- flat2_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q1 = _mm_andnot_si128(flat2, q1);
- flat2_q1 = _mm_and_si128(flat2, flat2_q1);
- q1 = _mm_or_si128(flat2_q1, q1);
+ __m256i flat2_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256);
+ flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1);
+ p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ // Derive p2 and q2
sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4),
_mm256_sub_epi16(p256_3, p256_0));
pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
res_p = _mm256_srli_epi16(pixelFilter_p, 4);
- flat2_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p2 = _mm_andnot_si128(flat2, p2);
- flat2_p2 = _mm_and_si128(flat2, flat2_p2);
- p2 = _mm_or_si128(flat2_p2, p2);
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-
sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4),
_mm256_sub_epi16(q256_3, q256_0));
pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
res_q = _mm256_srli_epi16(pixelFilter_q, 4);
- flat2_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q2 = _mm_andnot_si128(flat2, q2);
- flat2_q2 = _mm_and_si128(flat2, flat2_q2);
- q2 = _mm_or_si128(flat2_q2, q2);
+ __m256i flat2_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256);
+ flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2);
+ p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
_mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ // Derive p3 and q3
sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3),
_mm256_sub_epi16(p256_4, p256_1));
pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
res_p = _mm256_srli_epi16(pixelFilter_p, 4);
- flat2_p3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p3 = _mm_andnot_si128(flat2, p3);
- flat2_p3 = _mm_and_si128(flat2, flat2_p3);
- p3 = _mm_or_si128(flat2_p3, p3);
- _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
-
sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3),
_mm256_sub_epi16(q256_4, q256_1));
pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
res_q = _mm256_srli_epi16(pixelFilter_q, 4);
- flat2_q3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q3 = _mm_andnot_si128(flat2, q3);
- flat2_q3 = _mm_and_si128(flat2, flat2_q3);
- q3 = _mm_or_si128(flat2_q3, q3);
+ __m256i flat2_p3q3 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256);
+ flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3);
+ p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256);
+ p3 = _mm256_castsi256_si128(p3q3256);
+ q3 = _mm256_extractf128_si256(p3q3256, 1);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
_mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+ // Derive p4 and q4
sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2),
_mm256_sub_epi16(p256_5, p256_2));
pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
res_p = _mm256_srli_epi16(pixelFilter_p, 4);
- flat2_p4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p4 = _mm_andnot_si128(flat2, p4);
- flat2_p4 = _mm_and_si128(flat2, flat2_p4);
- p4 = _mm_or_si128(flat2_p4, p4);
- _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
-
sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2),
_mm256_sub_epi16(q256_5, q256_2));
pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
res_q = _mm256_srli_epi16(pixelFilter_q, 4);
- flat2_q4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q4 = _mm_andnot_si128(flat2, q4);
- flat2_q4 = _mm_and_si128(flat2, flat2_q4);
- q4 = _mm_or_si128(flat2_q4, q4);
- _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
-
+ __m256i flat2_p4q4 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256);
+ flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4);
+ p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256);
+ _mm_storeu_si128((__m128i *)(s - 5 * p),
+ _mm256_castsi256_si128(p4q4256));
+ _mm_storeu_si128((__m128i *)(s + 4 * p),
+ _mm256_extractf128_si256(p4q4256, 1));
+
+ // Derive p5 and q5
sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1),
_mm256_sub_epi16(p256_6, p256_3));
pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
res_p = _mm256_srli_epi16(pixelFilter_p, 4);
- flat2_p5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- p5 = _mm_andnot_si128(flat2, p5);
- flat2_p5 = _mm_and_si128(flat2, flat2_p5);
- p5 = _mm_or_si128(flat2_p5, p5);
- _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
-
sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1),
_mm256_sub_epi16(q256_6, q256_3));
pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
res_q = _mm256_srli_epi16(pixelFilter_q, 4);
- flat2_q5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- q5 = _mm_andnot_si128(flat2, q5);
- flat2_q5 = _mm_and_si128(flat2, flat2_q5);
- q5 = _mm_or_si128(flat2_q5, q5);
- _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+ __m256i flat2_p5q5 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256);
+ flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5);
+ p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256);
+ _mm_storeu_si128((__m128i *)(s - 6 * p),
+ _mm256_castsi256_si128(p5q5256));
+ _mm_storeu_si128((__m128i *)(s + 5 * p),
+ _mm256_extractf128_si256(p5q5256, 1));
} else {
_mm_storeu_si128((__m128i *)(s - 3 * p), p2);
_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
@@ -916,3 +997,20 @@ void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
}
}
}
+
+void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ trans_store_16x16_lpf_vert14(s - 8, pitch, t_dst, 16, 1);
+
+ // Loop filtering
+ aom_lpf_horizontal_14_quad_avx2(t_dst + 8 * 16, 16, _blimit0, _limit0,
+ _thresh0);
+
+ // Transpose back
+ trans_store_16x16_lpf_vert14(t_dst, 16, s - 8, pitch, 0);
+}
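
Editor's note: the new vertical 14-quad path above is built entirely out of the horizontal one — transpose a 16x16 neighbourhood into a scratch buffer, filter it as rows, then transpose back. A minimal standalone C sketch of that pattern (not part of the patch; filter_rows is a hypothetical stand-in for the horizontal kernel):

#include <stdint.h>

/* Transpose a 16x16 block of bytes: dst[j][i] = src[i][j]. */
static void transpose_16x16_c(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride) {
  for (int i = 0; i < 16; ++i)
    for (int j = 0; j < 16; ++j)
      dst[j * dst_stride + i] = src[i * src_stride + j];
}

/* Vertical filtering via a horizontal kernel: transpose in, filter the
 * rows of the scratch buffer, transpose back out. */
static void lpf_vertical_via_horizontal(
    uint8_t *s, int pitch, void (*filter_rows)(uint8_t *rows, int stride)) {
  uint8_t t_dst[16 * 16];
  transpose_16x16_c(s - 8, pitch, t_dst, 16);
  filter_rows(t_dst + 8 * 16, 16); /* rows now hold the former columns */
  transpose_16x16_c(t_dst, 16, s - 8, pitch);
}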
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 87c5bb32a..cdf24c332 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -2133,7 +2133,7 @@ void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p,
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
@@ -2438,7 +2438,7 @@ void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p,
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
@@ -2630,7 +2630,7 @@ void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p,
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
@@ -2802,7 +2802,7 @@ void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p,
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
@@ -2928,22 +2928,15 @@ void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch,
const uint8_t *_limit0,
const uint8_t *_thresh0) {
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
- unsigned char *src[2];
- unsigned char *dst[2];
// Transpose 16x8
transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
- src[0] = t_dst;
- src[1] = t_dst + 8;
-
- dst[0] = s - 4;
- dst[1] = s - 4 + pitch * 8;
// Transpose back
- transpose_8xn(src, 16, dst, pitch, 2);
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
}
void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
@@ -2951,22 +2944,15 @@ void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
const uint8_t *_limit0,
const uint8_t *_thresh0) {
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
- unsigned char *src[2];
- unsigned char *dst[2];
- // Transpose 16x8
+ // Transpose 16x8: (w x h) 8x16 to 16x8
transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
- src[0] = t_dst;
- src[1] = t_dst + 8;
-
- dst[0] = s - 4;
- dst[1] = s - 4 + pitch * 8;
- // Transpose back
- transpose_8xn(src, 16, dst, pitch, 2);
+ // Transpose back: (w x h) 16x8 to 8x16
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
}
void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
@@ -2974,8 +2960,6 @@ void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
const uint8_t *_limit0,
const uint8_t *_thresh0) {
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
- unsigned char *src[2];
- unsigned char *dst[2];
// Transpose 16x8
transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
@@ -2983,12 +2967,7 @@ void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
// Loop filtering
aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0,
_thresh0);
- src[0] = t_dst;
- src[1] = t_dst + 8;
-
- dst[0] = s - 4;
- dst[1] = s - 4 + pitch * 8;
// Transpose back
- transpose_8xn(src, 16, dst, pitch, 2);
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
}
diff --git a/aom_dsp/x86/lpf_common_sse2.h b/aom_dsp/x86/lpf_common_sse2.h
index 84daa94b0..45464e80b 100644
--- a/aom_dsp/x86/lpf_common_sse2.h
+++ b/aom_dsp/x86/lpf_common_sse2.h
@@ -564,6 +564,94 @@ static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1,
_mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
+static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p,
+ unsigned char *dst, int out_p) {
+ // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0
+ // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1
+ // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2
+ // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3
+ // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4
+ // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5
+ // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6
+ // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7
+ const __m128i x0 = _mm_loadu_si128((__m128i *)(src));
+ const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p)));
+ const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p)));
+ const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p)));
+ const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p)));
+ const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p)));
+ const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p)));
+ const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p)));
+
+ // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1
+ // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1
+ // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3
+ // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3
+ // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5
+ // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5
+ // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7
+ // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7
+ const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1);
+ const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1);
+ const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3);
+ const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3);
+ const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5);
+ const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5);
+ const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7);
+ const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7);
+
+ // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3
+ // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3
+ // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3
+ // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3
+ // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7
+ // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7
+ // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7
+ // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7
+ const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12);
+ const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12);
+ const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13);
+ const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13);
+ const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16);
+ const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16);
+ const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17);
+ const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17);
+
+ // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7
+ // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7
+ // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7
+ // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7
+ // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7
+ // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7
+ // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7
+ // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7
+ const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24);
+ const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24);
+ const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25);
+ const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25);
+ const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26);
+ const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26);
+ const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27);
+ const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27);
+
+ mm_storelu(dst, x_s30);
+ mm_storehu(dst + (1 * out_p), x_s30);
+ mm_storelu(dst + (2 * out_p), x_s31);
+ mm_storehu(dst + (3 * out_p), x_s31);
+ mm_storelu(dst + (4 * out_p), x_s32);
+ mm_storehu(dst + (5 * out_p), x_s32);
+ mm_storelu(dst + (6 * out_p), x_s33);
+ mm_storehu(dst + (7 * out_p), x_s33);
+ mm_storelu(dst + (8 * out_p), x_s34);
+ mm_storehu(dst + (9 * out_p), x_s34);
+ mm_storelu(dst + (10 * out_p), x_s35);
+ mm_storehu(dst + (11 * out_p), x_s35);
+ mm_storelu(dst + (12 * out_p), x_s36);
+ mm_storehu(dst + (13 * out_p), x_s36);
+ mm_storelu(dst + (14 * out_p), x_s37);
+ mm_storehu(dst + (15 * out_p), x_s37);
+}
+
static INLINE void transpose_8xn(unsigned char *src[], int in_p,
unsigned char *dst[], int out_p,
int num_8x8_to_transpose) {
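
Editor's note: transpose_16x8_to_8x16() replaces the pointer-array transpose_8xn() call in the vertical quad filters, doing the transpose with three rounds of unpack (8-, 16-, then 32-bit) followed by low/high 64-bit stores. A scalar reference one might check it against (a sketch, not part of the patch) is simply:

#include <stdint.h>

/* Reference: treat src as a 16-wide x 8-tall byte block and write its
 * transpose as an 8-wide x 16-tall block; output row r is input column r. */
static void transpose_16x8_to_8x16_c(const uint8_t *src, int in_p,
                                     uint8_t *dst, int out_p) {
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 8; ++c)
      dst[r * out_p + c] = src[c * in_p + r];
}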
diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c
index 1235f2779..799ce9ef4 100644
--- a/aom_dsp/x86/masked_sad4d_ssse3.c
+++ b/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -153,15 +153,15 @@ void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
_mm_storeu_si128((__m128i *)sad_array, res0);
}
-#define MASK_SAD4XH_ONE_REF(idx) \
- a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx), \
- _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \
- data = _mm_unpacklo_epi8(a, b); \
- mask = _mm_unpacklo_epi8(m, m_inv); \
- pred = _mm_maddubs_epi16(data, mask); \
- pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \
- \
- pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
+#define MASK_SAD4XH_ONE_REF(idx) \
+ a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx), \
+ _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \
+ data = _mm_unpacklo_epi8(a, b); \
+ mask = _mm_unpacklo_epi8(m, m_inv); \
+ pred = _mm_maddubs_epi16(data, mask); \
+ pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
@@ -182,15 +182,15 @@ void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
for (int y = 0; y < height; y += 2) {
- const __m128i src = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
const __m128i b =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
const __m128i m_copy =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
__m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
__m128i m = inv_mask ? m_inv : m_copy;
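
Editor's note: these masked-SAD kernels blend the two predictors with a 6-bit alpha mask via _mm_maddubs_epi16 and then accumulate the SAD against the source; the uint32_t-to-int cast changes only affect how the 4-byte gathers are typed. Assuming the usual AOM_BLEND_A64 semantics (mask values 0..64, 6 rounding bits), a scalar model of what one of these kernels computes is:

#include <stdint.h>
#include <stdlib.h>

/* Blend a and b with mask m (0..64), round, then sum |src - blend|. */
static unsigned masked_sad_c(const uint8_t *src, int src_stride,
                             const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             const uint8_t *m, int m_stride,
                             int width, int height) {
  unsigned sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int blend = (a[x] * m[x] + b[x] * (64 - m[x]) + 32) >> 6;
      sad += (unsigned)abs(src[x] - blend);
    }
    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return sad;
}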
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
index fd5352cc1..df3a8764e 100644
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -194,18 +194,18 @@ unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
for (y = 0; y < height; y += 2) {
// Load two rows at a time; this seems to be a bit faster
// than four rows at a time in this case.
- const __m128i src = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
const __m128i a =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr),
+ _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride]));
const __m128i b =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
const __m128i m =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
const __m128i m_inv = _mm_sub_epi8(mask_max, m);
const __m128i data = _mm_unpacklo_epi8(a, b);
@@ -367,9 +367,8 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
_mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
// Zero-extend mask to 16 bits
const __m128i m = _mm_unpacklo_epi8(
- _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
_mm_setzero_si128());
const __m128i m_inv = _mm_sub_epi16(mask_max, m);
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
index ac0e576bb..0bf383fff 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -312,7 +312,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
uint8_t *b = dst;
for (i = 0; i < h + 1; ++i) {
__m128i x = xx_loadl_32((__m128i *)src);
- xx_storel_32((__m128i *)b, x);
+ xx_storel_32(b, x);
src += src_stride;
b += 4;
}
@@ -321,7 +321,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
for (i = 0; i < h + 1; ++i) {
__m128i x = _mm_loadl_epi64((__m128i *)src);
__m128i z = _mm_srli_si128(x, 1);
- xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
+ xx_storel_32(b, _mm_avg_epu8(x, z));
src += src_stride;
b += 4;
}
@@ -357,7 +357,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
v0 = _mm_maddubs_epi16(v0, hfilter_vec);
v0 = xx_roundn_epu16(v0, FILTER_BITS);
- xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
+ xx_storel_32(b, _mm_packus_epi16(v0, v0));
}
// Vertical filter
@@ -367,7 +367,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
for (i = 0; i < h; ++i) {
__m128i x = xx_loadl_32((__m128i *)dst);
__m128i y = xx_loadl_32((__m128i *)&dst[4]);
- xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
+ xx_storel_32(dst, _mm_avg_epu8(x, y));
dst += 4;
}
} else {
@@ -494,15 +494,14 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
for (y = 0; y < height; y += 4) {
// Load four rows at a time
- __m128i src =
- _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
- *(uint32_t *)&src_ptr[src_stride * 2],
- *(uint32_t *)&src_ptr[src_stride * 3]);
+ __m128i src = _mm_setr_epi32(*(int *)src_ptr, *(int *)&src_ptr[src_stride],
+ *(int *)&src_ptr[src_stride * 2],
+ *(int *)&src_ptr[src_stride * 3]);
const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
- const __m128i m = _mm_setr_epi32(
- *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
- *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
+ const __m128i m = _mm_setr_epi32(*(int *)m_ptr, *(int *)&m_ptr[m_stride],
+ *(int *)&m_ptr[m_stride * 2],
+ *(int *)&m_ptr[m_stride * 3]);
accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
src_ptr += src_stride * 4;
@@ -986,9 +985,8 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
const __m128i m = _mm_unpacklo_epi8(
- _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
zero);
const __m128i m_inv = _mm_sub_epi16(mask_max, m);
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h
index dacb61364..085a572cb 100644
--- a/aom_dsp/x86/mem_sse2.h
+++ b/aom_dsp/x86/mem_sse2.h
@@ -19,20 +19,20 @@
#include "aom/aom_integer.h"
-static INLINE uint16_t loadu_uint16(const void *src) {
- uint16_t v;
+static INLINE int16_t loadu_int16(const void *src) {
+ int16_t v;
memcpy(&v, src, sizeof(v));
return v;
}
-static INLINE uint32_t loadu_uint32(const void *src) {
- uint32_t v;
+static INLINE int32_t loadu_int32(const void *src) {
+ int32_t v;
memcpy(&v, src, sizeof(v));
return v;
}
-static INLINE uint64_t loadu_uint64(const void *src) {
- uint64_t v;
+static INLINE int64_t loadu_int64(const void *src) {
+ int64_t v;
memcpy(&v, src, sizeof(v));
return v;
}
@@ -48,10 +48,10 @@ static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
const int byte_stride) {
- return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride),
- loadu_uint32((int8_t *)src + 1 * byte_stride),
- loadu_uint32((int8_t *)src + 2 * byte_stride),
- loadu_uint32((int8_t *)src + 3 * byte_stride));
+ return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride),
+ loadu_int32((int8_t *)src + 1 * byte_stride),
+ loadu_int32((int8_t *)src + 2 * byte_stride),
+ loadu_int32((int8_t *)src + 3 * byte_stride));
}
static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
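
Editor's note: the loadu_int* helpers keep the memcpy idiom, which is the portable way to do an unaligned, strict-aliasing-safe load that still compiles down to a single mov; the switch to signed return types matches the int arguments that _mm_cvtsi32_si128() and _mm_setr_epi32() take. The idiom in isolation (a sketch, not the library header):

#include <stdint.h>
#include <string.h>

/* Aliasing-safe unaligned 32-bit load; optimizes to one 4-byte load. */
static inline int32_t load_unaligned_i32(const void *src) {
  int32_t v;
  memcpy(&v, src, sizeof(v));
  return v;
}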
diff --git a/aom_dsp/x86/obmc_intrinsic_sse4.h b/aom_dsp/x86/obmc_intrinsic_sse4.h
index 5181e444c..210f466b6 100644
--- a/aom_dsp/x86/obmc_intrinsic_sse4.h
+++ b/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -28,7 +28,7 @@ static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
assert(IS_POWER_OF_TWO(h));
do {
- const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
diff --git a/aom_dsp/x86/quantize_avx2.c b/aom_dsp/x86/quantize_avx2.c
index 5763bd679..b808d4677 100644
--- a/aom_dsp/x86/quantize_avx2.c
+++ b/aom_dsp/x86/quantize_avx2.c
@@ -128,7 +128,7 @@ void aom_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *iscan) {
(void)scan;
__m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
- __m256i v_eobmax = _mm256_set1_epi16(0);
+ __m256i v_eobmax = _mm256_setzero_si256();
load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
&v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
@@ -211,7 +211,7 @@ static AOM_FORCE_INLINE void quantize_b_no_qmatrix_avx2(
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *iscan, int log_scale) {
__m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
- __m256i v_eobmax = _mm256_set1_epi16(0);
+ __m256i v_eobmax = _mm256_setzero_si256();
load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
&v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 1abeb4c3d..0fea6ddfd 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -13,10 +13,38 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
-void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
+static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
+ const __m256i *sum_ref0,
+ const __m256i *sum_ref1,
+ const __m256i *sum_ref2,
+ const __m256i *sum_ref3) {
+ // In sum_ref-i the result is saved in the first 4 bytes and the other 4
+ // bytes are zeroed.
+ // merge sum_ref0 with sum_ref1, and sum_ref2 with sum_ref3
+ // 0, 0, 1, 1
+ __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ // 2, 2, 3, 3
+ __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+
+ // sum adjacent 32 bit integers
+ __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23);
+
+ // add the low 128 bit to the high 128 bit
+ __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123),
+ _mm256_extractf128_si256(sum_ref0123, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+}
+
+static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
+ int M, int N, const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
__m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
int i, j;
@@ -57,32 +85,49 @@ void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride,
ref2 += ref_stride;
ref3 += ref_stride;
}
- {
- __m128i sum;
- __m256i sum_mlow, sum_mhigh;
- // in sum_ref-i the result is saved in the first 4 bytes
- // the other 4 bytes are zeroed.
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes
- sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
- sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
- // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
- sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
- sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
- // merge every 64 bit from each sum_ref-i
- sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
- sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
- // add the low 64 bit to the high 64 bit
- sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
- // add the low 128 bit to the high 128 bit
- sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
- _mm256_extractf128_si256(sum_mlow, 1));
-
- _mm_storeu_si128((__m128i *)(res), sum);
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
+}
+
+static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
+ int M, int N, const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ int i, j;
+ const uint8_t *ref0, *ref1, *ref2;
+ const __m256i zero = _mm256_setzero_si256();
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j += 32) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ }
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
}
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
}
#define SADMXN_AVX2(m, n) \
@@ -90,6 +135,11 @@ void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride, \
uint32_t res[4]) { \
aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ } \
+ void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
}
SADMXN_AVX2(32, 8)
@@ -129,3 +179,148 @@ SAD_SKIP_MXN_AVX2(64, 128)
SAD_SKIP_MXN_AVX2(128, 64)
SAD_SKIP_MXN_AVX2(128, 128)
+
+static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ const uint8_t *ref0, *ref1, *ref2;
+ const __m256i zero = _mm256_setzero_si256();
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
+}
+
+static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+ sum_ref3 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+ ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ ref3 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
+}
+
+#define SAD16XNX3_AVX2(n) \
+ void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+#define SAD16XNX4_AVX2(n) \
+ void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SAD16XNX4_AVX2(32)
+SAD16XNX4_AVX2(16)
+SAD16XNX4_AVX2(8)
+
+SAD16XNX3_AVX2(32)
+SAD16XNX3_AVX2(16)
+SAD16XNX3_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD16XNX3_AVX2(64)
+SAD16XNX3_AVX2(4)
+
+SAD16XNX4_AVX2(64)
+SAD16XNX4_AVX2(4)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#define SAD_SKIP_16XN_AVX2(n) \
+ void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \
+ res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_16XN_AVX2(32)
+SAD_SKIP_16XN_AVX2(16)
+SAD_SKIP_16XN_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_16XN_AVX2(64)
+SAD_SKIP_16XN_AVX2(4)
+#endif // !CONFIG_REALTIME_ONLY
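
Editor's note: the SAD_SKIP_16XN_AVX2 wrappers implement the skip-rows speed feature: they run the regular 4D SAD over every other row (doubled strides, halved row count) and double the result to approximate the full-block SAD. In scalar terms the approximation is (a sketch, not the library code):

#include <stdint.h>
#include <stdlib.h>

/* Exact SAD of a width x height block. */
static unsigned sad_c(const uint8_t *src, int src_stride, const uint8_t *ref,
                      int ref_stride, int width, int height) {
  unsigned sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) sad += (unsigned)abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* Skip-rows SAD: sample every second row and scale the result by two. */
static unsigned sad_skip_c(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int width,
                           int height) {
  return 2 * sad_c(src, 2 * src_stride, ref, 2 * ref_stride, width,
                   height / 2);
}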
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
index 9ab44c134..6de708b99 100644
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ b/aom_dsp/x86/sad4d_sse2.asm
@@ -22,114 +22,95 @@ SECTION .text
pavgb %2, m2
lea second_predq, [second_predq+8]
%endmacro
-; 'mflag' affect a lot how the code works.
+; 'spill_src_stride' has a large effect on how the code works.
;
-; When 'mflag' is false, the 'src_strideq' resides in register,
-; [srcq + src_strideq + offset] is allowed, so we can simply
-; use such form to access src memory and don't bother to update
-; 'srcq' at each line. We only update 'srcq' each two-lines using
-; a compact LEA instruction like [srcq+src_strideq*2].
+; When 'spill_src_stride' is false, the 'src_strideq' resides in
+; register, [srcq + src_strideq + offset] is allowed, so we can simply
+; use such form to access src memory and don't bother to update 'srcq'
+; at each line. We only update 'srcq' once every two lines using a compact
+; LEA instruction like [srcq+src_strideq*2].
;
-; When 'mflag' is true, the 'src_strideq' resides in memory.
+; When 'spill_src_stride' is true, the 'src_strideq' resides in memory.
; we cannot use the above form to access memory; we have to update
; 'srcq' at each line break. As we process two parts (first, second)
; together in each macro function, the second part may also sit on
; the next line, which means we may also need to add one
; 'src_strideq' to 'srcq' before processing the second part.
-%macro HANDLE_FIRST_OFFSET 2
- %define first_offset %2
- %if mflag == 0 && %1 == 1
- %define first_offset (src_strideq + %2)
- %endif
-%endmacro
-
-; first_extraline, second_extraline, in_line_offset
-%macro HANDLE_SECOND_OFFSET 3
- %define second_offset %3
- %if mflag && %1 == 0 && %2 == 1
+%macro HANDLE_SECOND_OFFSET 0
+ %if spill_src_stride
+ %define second_offset 0
add srcq, src_strideq
- %endif
- %if mflag == 0 && %2 == 1
- %define second_offset (src_strideq + %3)
+ %else
+ %define second_offset (src_strideq)
%endif
%endmacro
-; Notes for line_ending:
-; 0 -- not a line ending
-; 1 -- line ending of a odd line [line numbers starts from one]
-; 2 -- line ending of a even line
; This is specifically designed to handle the case when src_strideq is a
; memory location; in that case we cannot perform the complex address
; calculation using LEA, and fall back to using a simple ADD
; instruction at each line ending.
-%macro ADVANCE_END_OF_LINE 1
- %if mflag
+%macro ADVANCE_END_OF_TWO_LINES 0
+ %if spill_src_stride
add srcq, src_strideq
- %endif
- %if mflag == 0 && %1 == 2
- lea srcq, [srcq +src_strideq*2]
+ %else
+ lea srcq, [srcq+src_strideq*2]
%endif
- %if %1 == 2
- lea ref1q, [ref1q+ref_strideq*2]
- lea ref2q, [ref2q+ref_strideq*2]
- lea ref3q, [ref3q+ref_strideq*2]
- lea ref4q, [ref4q+ref_strideq*2]
- %endif
+; note: ref_stride is never spilled when processing two lines
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
%endmacro
-; Please note that the second_offset of src is for in_line_offset,
-; so it is less than src_stride.
-; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, do_avg,
-; {first, second}_extraline, line_ending
-%macro PROCESS_4x2x4 9
- HANDLE_FIRST_OFFSET %7, %2
- movd m0, [srcq + first_offset]
- HANDLE_SECOND_OFFSET %7, %8, %4
+; PROCESS_4x2x4 first, do_avg
+%macro PROCESS_4x2x4 2
+ movd m0, [srcq]
+ HANDLE_SECOND_OFFSET
%if %1 == 1
- movd m6, [ref1q+%3]
- movd m4, [ref2q+%3]
- movd m7, [ref3q+%3]
- movd m5, [ref4q+%3]
+ movd m6, [ref1q]
+ movd m4, [ref2q]
+ movd m7, [ref3q]
+ movd m5, [ref4q]
movd m1, [srcq + second_offset]
- movd m2, [ref1q+%5]
+ movd m2, [ref1q+ref_strideq]
punpckldq m0, m1
punpckldq m6, m2
- movd m1, [ref2q+%5]
- movd m2, [ref3q+%5]
- movd m3, [ref4q+%5]
+ movd m1, [ref2q+ref_strideq]
+ movd m2, [ref3q+ref_strideq]
+ movd m3, [ref4q+ref_strideq]
punpckldq m4, m1
punpckldq m7, m2
punpckldq m5, m3
movlhps m0, m0
movlhps m6, m4
movlhps m7, m5
-%if %6 == 1
+%if %2 == 1
AVG_4x2x4 m6, m7
%endif
psadbw m6, m0
psadbw m7, m0
%else
- movd m1, [ref1q+%3]
- movd m5, [ref1q+%5]
- movd m2, [ref2q+%3]
- movd m4, [ref2q+%5]
+ movd m1, [ref1q]
+ movd m5, [ref1q+ref_strideq]
+ movd m2, [ref2q]
+ movd m4, [ref2q+ref_strideq]
punpckldq m1, m5
punpckldq m2, m4
- movd m3, [ref3q+%3]
- movd m5, [ref3q+%5]
+ movd m3, [ref3q]
+ movd m5, [ref3q+ref_strideq]
punpckldq m3, m5
- movd m4, [ref4q+%3]
- movd m5, [ref4q+%5]
+ movd m4, [ref4q]
+ movd m5, [ref4q+ref_strideq]
punpckldq m4, m5
movd m5, [srcq + second_offset]
punpckldq m0, m5
movlhps m0, m0
movlhps m1, m2
movlhps m3, m4
-%if %6 == 1
+%if %2 == 1
AVG_4x2x4 m1, m3
%endif
psadbw m1, m0
@@ -137,28 +118,23 @@ SECTION .text
paddd m6, m1
paddd m7, m3
%endif
-%if %9 > 0
- ADVANCE_END_OF_LINE %9
-%endif
%endmacro
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, do_avg,
-; {first,second}_extraline, line_ending
-%macro PROCESS_8x2x4 9
- HANDLE_FIRST_OFFSET %7, %2
- movh m0, [srcq + first_offset]
- HANDLE_SECOND_OFFSET %7, %8, %4
+; PROCESS_8x2x4 first, do_avg
+%macro PROCESS_8x2x4 2
+ movh m0, [srcq]
+ HANDLE_SECOND_OFFSET
%if %1 == 1
- movh m4, [ref1q+%3]
- movh m5, [ref2q+%3]
- movh m6, [ref3q+%3]
- movh m7, [ref4q+%3]
+ movh m4, [ref1q]
+ movh m5, [ref2q]
+ movh m6, [ref3q]
+ movh m7, [ref4q]
movhps m0, [srcq + second_offset]
- movhps m4, [ref1q+%5]
- movhps m5, [ref2q+%5]
- movhps m6, [ref3q+%5]
- movhps m7, [ref4q+%5]
-%if %6 == 1
+ movhps m4, [ref1q+ref_strideq]
+ movhps m5, [ref2q+ref_strideq]
+ movhps m6, [ref3q+ref_strideq]
+ movhps m7, [ref4q+ref_strideq]
+%if %2 == 1
movu m3, [second_predq]
pavgb m4, m3
pavgb m5, m3
@@ -171,12 +147,12 @@ SECTION .text
psadbw m6, m0
psadbw m7, m0
%else
- movh m1, [ref1q+%3]
- movh m2, [ref2q+%3]
+ movh m1, [ref1q]
+ movh m2, [ref2q]
movhps m0, [srcq + second_offset]
- movhps m1, [ref1q+%5]
- movhps m2, [ref2q+%5]
-%if %6 == 1
+ movhps m1, [ref1q+ref_strideq]
+ movhps m2, [ref2q+ref_strideq]
+%if %2 == 1
movu m3, [second_predq]
pavgb m1, m3
pavgb m2, m3
@@ -186,11 +162,11 @@ SECTION .text
paddd m4, m1
paddd m5, m2
- movh m1, [ref3q+%3]
- movhps m1, [ref3q+%5]
- movh m2, [ref4q+%3]
- movhps m2, [ref4q+%5]
-%if %6 == 1
+ movh m1, [ref3q]
+ movhps m1, [ref3q+ref_strideq]
+ movh m2, [ref4q]
+ movhps m2, [ref4q+ref_strideq]
+%if %2 == 1
pavgb m1, m3
pavgb m2, m3
lea second_predq, [second_predq+mmsize]
@@ -200,24 +176,16 @@ SECTION .text
paddd m6, m1
paddd m7, m2
%endif
-%if %9 > 0
- ADVANCE_END_OF_LINE %9
-%endif
%endmacro
-; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, do_avg,
-; {first,second}_extraline, line_ending
-%macro PROCESS_16x2x4 9
- ; 1st 16 px
- HANDLE_FIRST_OFFSET %7, %2
- mova m0, [srcq + first_offset]
- HANDLE_SECOND_OFFSET %7, %8, %4
+; PROCESS_FIRST_MMSIZE do_avg
+%macro PROCESS_FIRST_MMSIZE 1
+ mova m0, [srcq]
+ movu m4, [ref1q]
+ movu m5, [ref2q]
+ movu m6, [ref3q]
+ movu m7, [ref4q]
%if %1 == 1
- movu m4, [ref1q+%3]
- movu m5, [ref2q+%3]
- movu m6, [ref3q+%3]
- movu m7, [ref4q+%3]
-%if %6 == 1
movu m3, [second_predq]
pavgb m4, m3
pavgb m5, m3
@@ -229,38 +197,14 @@ SECTION .text
psadbw m5, m0
psadbw m6, m0
psadbw m7, m0
-%else ; %1 == 1
- movu m1, [ref1q+%3]
- movu m2, [ref2q+%3]
-%if %6 == 1
- movu m3, [second_predq]
- pavgb m1, m3
- pavgb m2, m3
-%endif
- psadbw m1, m0
- psadbw m2, m0
- paddd m4, m1
- paddd m5, m2
-
- movu m1, [ref3q+%3]
- movu m2, [ref4q+%3]
-%if %6 == 1
- pavgb m1, m3
- pavgb m2, m3
- lea second_predq, [second_predq+mmsize]
-%endif
- psadbw m1, m0
- psadbw m2, m0
- paddd m6, m1
- paddd m7, m2
-%endif ; %1 == 1
-
- ; 2nd 16 px
- mova m0, [srcq + second_offset]
- movu m1, [ref1q+%5]
- movu m2, [ref2q+%5]
+%endmacro
-%if %6 == 1
+; PROCESS_16x1x4 offset, do_avg
+%macro PROCESS_16x1x4 2
+ mova m0, [srcq + %1]
+ movu m1, [ref1q + ref_offsetq + %1]
+ movu m2, [ref2q + ref_offsetq + %1]
+%if %2 == 1
movu m3, [second_predq]
pavgb m1, m3
pavgb m2, m3
@@ -270,14 +214,9 @@ SECTION .text
paddd m4, m1
paddd m5, m2
- movu m1, [ref3q+%5]
- movu m2, [ref4q+%5]
-
-%if %9 > 0
- ADVANCE_END_OF_LINE %9
-%endif
-
-%if %6 == 1
+ movu m1, [ref3q + ref_offsetq + %1]
+ movu m2, [ref4q + ref_offsetq + %1]
+%if %2 == 1
pavgb m1, m3
pavgb m2, m3
lea second_predq, [second_predq+mmsize]
@@ -288,27 +227,6 @@ SECTION .text
paddd m7, m2
%endmacro
-; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, do_avg,
-; {first,second}_extraline, line_ending
-%macro PROCESS_32x2x4 9
- PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16, %6, %7, %7, %8 - %7
- PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6, %8, %8, %9
-%endmacro
-
-; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, do_avg,
-; {first,second}_extraline, line_ending
-%macro PROCESS_64x2x4 9
- PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32, %6, %7, %7, %8 - %7
- PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6, %8, %8, %9
-%endmacro
-
-; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, do_avg,
-; {first,second}_extraline, line_ending
-%macro PROCESS_128x2x4 9
- PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64, %6, %7, %7, %8 - %7
- PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6, %8, %8, %9
-%endmacro
-
; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
; uint32_t res[4]);
@@ -318,38 +236,118 @@ SECTION .text
; 3: If 0, then normal sad, else avg
; 4: If 0, then normal sad, else skip rows
%macro SADNXN4D 2-4 0,0
+
+%define spill_src_stride 0
+%define spill_ref_stride 0
+%define spill_cnt 0
+
+; Whether a shared offset should be used instead of adding strides to
+; each reference array. With this option, only one line will be processed
+; per loop iteration.
+%define use_ref_offset (%1 >= mmsize)
+
+; Remove loops in the 4x4 and 8x4 case
+%define use_loop (use_ref_offset || %2 > 4)
+
%if %4 == 1 ; skip rows
%if ARCH_X86_64
-cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
- res, ref2, ref3, ref4
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt
+%else
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4
+%endif
%else
-cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
- ref2, ref3, ref4
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
+ ref4
+%define spill_src_stride 1
+%define spill_ref_stride 1
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
+ ref3, ref4
+%define spill_src_stride 1
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
+ ref3, ref4
+%endif
%endif
%elif %3 == 0 ; normal sad
%if ARCH_X86_64
-cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
- res, ref2, ref3, ref4
+%if use_ref_offset
+cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt
+%else
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4
+%endif
+%else
+%if use_ref_offset
+cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
+ %define spill_src_stride 1
+ %define spill_ref_stride 1
+%elif use_loop
+cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
+ %define spill_src_stride 1
%else
-cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
- ref2, ref3, ref4
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
+ ref4
+%endif
%endif
%else ; avg
%if ARCH_X86_64
+%if use_ref_offset
+cglobal sad%1x%2x4d_avg, 6, 11, 8, src, src_stride, ref1, ref_stride, \
+ second_pred, res, ref2, ref3, ref4, cnt, \
+ ref_offset
+%elif use_loop
cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \
- second_pred, res, ref2, ref3, ref4
+ second_pred, res, ref2, ref3, ref4, cnt
+%else
+cglobal sad%1x%2x4d_avg, 6, 9, 8, src, src_stride, ref1, ref_stride, \
+ second_pred, res, ref2, ref3, ref4
+%endif
%else
-cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
- second_pred, ref2, ref3
+%if use_ref_offset
+cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_offset, second_pred, ref2, ref3
+ %define spill_src_stride 1
+ %define spill_ref_stride 1
+ %define spill_cnt 1
+%elif use_loop
+cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, second_pred, ref2, ref3
+ %define spill_src_stride 1
+ %define spill_cnt 1
+%else
+cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, second_pred, ref2, ref3
+ %define spill_src_stride 1
+%endif
+%endif
+%endif
+
+%if spill_src_stride
%define src_strideq r1mp
%define src_strided r1mp
%endif
+%if spill_ref_stride
+ %define ref_strideq r3mp
+ %define ref_strided r3mp
+%endif
+
+%if spill_cnt
+ SUB rsp, 4
+ %define cntd word [rsp]
%endif
- %define mflag ((1 - ARCH_X86_64) & %3)
%if %4 == 1
- lea src_strided, [2*src_strided]
- lea ref_strided, [2*ref_strided]
+ sal src_strided, 1
+ sal ref_strided, 1
%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
@@ -359,18 +357,67 @@ cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
mov ref4q, [ref1q+gprsize*3]
mov ref1q, [ref1q+gprsize*0]
- PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2
-%if %4 == 1 ; downsample number of rows by 2
-%define num_rep (%2-8)/4
+; Is the loop for this wxh in another function?
+; If so, we jump into that function for the loop and the return.
+%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
+
+%if use_ref_offset
+ PROCESS_FIRST_MMSIZE %3
+%if %1 > mmsize
+ mov ref_offsetq, 0
+ mov cntd, %2 >> %4
+; Jump part way into the loop for the square version of this width
+%if %3 == 1
+ jmp mangle(private_prefix %+ _sad%1x%1x4d_avg %+ SUFFIX).midloop
+%elif %4 == 1
+ jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
%else
-%define num_rep (%2-4)/2
+ jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
+%endif
+%else
+ mov ref_offsetq, ref_strideq
+ add srcq, src_strideq
+ mov cntd, (%2 >> %4) - 1
+%endif
+%if external_loop == 0
+.loop:
+; Unrolled horizontal loop
+%assign h_offset 0
+%rep %1/mmsize
+ PROCESS_16x1x4 h_offset, %3
+%if h_offset == 0
+; The first row of the first column is done outside the loop and jumps here
+.midloop:
%endif
-%rep num_rep
- PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
+%assign h_offset h_offset+mmsize
%endrep
-%undef num_rep
- PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
+ add srcq, src_strideq
+ add ref_offsetq, ref_strideq
+ sub cntd, 1
+ jnz .loop
+%endif
+%else
+ PROCESS_%1x2x4 1, %3
+ ADVANCE_END_OF_TWO_LINES
+%if use_loop
+ mov cntd, (%2/2 >> %4) - 1
+.loop:
+%endif
+ PROCESS_%1x2x4 0, %3
+%if use_loop
+ ADVANCE_END_OF_TWO_LINES
+ sub cntd, 1
+ jnz .loop
+%endif
+%endif
+
+%if spill_cnt
+; Undo stack allocation for cnt
+ ADD rsp, 4
+%endif
+
+%if external_loop == 0
%if %3 == 0
%define resultq r4
%define resultmp r4mp
@@ -379,6 +426,16 @@ cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
%define resultmp r5mp
%endif
+; Undo modifications on parameters on the stack
+%if %4 == 1
+%if spill_src_stride
+ shr src_strided, 1
+%endif
+%if spill_ref_stride
+ shr ref_strided, 1
+%endif
+%endif
+
%if %1 > 4
pslldq m5, 4
pslldq m7, 4
@@ -407,6 +464,7 @@ cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
movq [resultq+8], m7
RET
%endif
+%endif ; external_loop == 0
%endmacro
INIT_XMM sse2
diff --git a/aom_dsp/x86/subtract_avx2.c b/aom_dsp/x86/subtract_avx2.c
index 40831600a..b4c5cc7c7 100644
--- a/aom_dsp/x86/subtract_avx2.c
+++ b/aom_dsp/x86/subtract_avx2.c
@@ -16,14 +16,15 @@ static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
const uint8_t *pred_ptr) {
__m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
__m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
- __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
- __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
- __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
- __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
- const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
- const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
- _mm256_store_si256((__m256i *)(diff_ptr), d_0);
- _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
+ __m256i set_one_minusone = _mm256_set1_epi32((int)0xff01ff01);
+ __m256i diff0 = _mm256_unpacklo_epi8(s, p);
+ __m256i diff1 = _mm256_unpackhi_epi8(s, p);
+ diff0 = _mm256_maddubs_epi16(diff0, set_one_minusone);
+ diff1 = _mm256_maddubs_epi16(diff1, set_one_minusone);
+ _mm256_store_si256((__m256i *)(diff_ptr),
+ _mm256_permute2x128_si256(diff0, diff1, 0x20));
+ _mm256_store_si256((__m256i *)(diff_ptr + 16),
+ _mm256_permute2x128_si256(diff0, diff1, 0x31));
}
static INLINE void subtract_block_16xn_avx2(
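
Editor's note: the rewritten subtract32_avx2() computes the residual with one _mm256_maddubs_epi16 per 16 byte pairs: src and pred bytes are interleaved and multiplied by the (+1, -1) weight pattern 0xff01ff01, so every 16-bit lane becomes src - pred directly (the difference always fits in int16, so the instruction's saturation never triggers), and the final permute2x128 restores linear order across the two 128-bit lanes. The result matches this scalar reference (a sketch, not part of the patch):

#include <stdint.h>

/* Reference for one 32-pixel row: diff[i] = src[i] - pred[i]. */
static void subtract32_c(int16_t *diff, const uint8_t *src,
                         const uint8_t *pred) {
  for (int i = 0; i < 32; ++i)
    diff[i] = (int16_t)((int)src[i] - (int)pred[i]);
}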
diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c
index 0d63db288..89b9b824b 100644
--- a/aom_dsp/x86/sum_squares_avx2.c
+++ b/aom_dsp/x86/sum_squares_avx2.c
@@ -21,7 +21,7 @@ static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
int width, int height) {
uint64_t result;
__m256i v_acc_q = _mm256_setzero_si256();
- const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+ const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
for (int col = 0; col < height; col += 4) {
__m256i v_acc_d = _mm256_setzero_si256();
for (int row = 0; row < width; row += 16) {
diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c
index 0bdeee9f2..25be8568a 100644
--- a/aom_dsp/x86/sum_squares_sse2.c
+++ b/aom_dsp/x86/sum_squares_sse2.c
@@ -84,7 +84,7 @@ uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
src += stride << 2;
r += 4;
} while (r < height);
- const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
__m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
_mm_and_si128(v_acc_q, v_zext_mask_q));
v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
@@ -116,7 +116,7 @@ aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
int height) {
int r = 0;
- const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
__m128i v_acc_q = _mm_setzero_si128();
do {
@@ -254,7 +254,7 @@ uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
//////////////////////////////////////////////////////////////////////////////
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
- const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
__m128i v_acc0_q = _mm_setzero_si128();
__m128i v_acc1_q = _mm_setzero_si128();
@@ -306,7 +306,7 @@ uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
if (n % 64 == 0) {
return aom_sum_squares_i16_64n_sse2(src, n);
} else if (n > 64) {
- int k = n & ~(64 - 1);
+ const uint32_t k = n & ~63u;
return aom_sum_squares_i16_64n_sse2(src, k) +
aom_sum_squares_i16_c(src + k, n - k);
} else {
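
Editor's note: two small cleanups in the sum-of-squares kernels. The 64-bit zero-extension mask is now built from ~0 rather than the literal 0xffffffff (the same 32-bit pattern, presumably to avoid an implicit-conversion warning), and the split point for lengths that are not a multiple of 64 is computed as n & ~63u in unsigned arithmetic. A small standalone sketch of the identities relied on (not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
  // ~0 viewed as a 32-bit value is 0xffffffff, so the zero-extension mask
  // built from it is unchanged.
  assert((uint32_t)~0 == 0xffffffffu);

  // n & ~63u rounds n down to the nearest multiple of 64.
  for (uint32_t n = 0; n < 256; ++n) {
    assert((n & ~63u) == (n / 64) * 64);
  }
  return 0;
}
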
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 7398a73b0..a475fb72a 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -234,105 +234,6 @@ unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
return *sse;
}
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse);
-unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse);
-
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
- int height, unsigned int *sseptr);
-
-#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \
- unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
- &sse2); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
- }
-
-AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7)
-AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6)
-AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7)
-AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6)
-AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5)
-AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6)
-AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5)
-AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4)
-AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5)
-AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4)
-AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3)
-#if !CONFIG_REALTIME_ONLY
-AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6)
-AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2)
-#endif
-
-#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \
- unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
- const uint8_t *sec) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- const uint8_t *sec_ptr = sec; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
- sec_ptr, w, hf, &sse2); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- sec_ptr += hf * w; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- sec += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
- }
-
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7)
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6)
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7)
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6)
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5)
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6)
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5)
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4)
-
static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
const __m256i d =
_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
@@ -534,15 +435,15 @@ uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
__m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
__m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
__m256i src0_8x16, src1_8x16, dst_16x16, src_16x16;
- __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i res0_4x64, res1_4x64;
__m256i sub_result;
const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
__m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
for (int i = 0; i < h; i += 4) {
- dst0_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
- dst1_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
- dst2_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 2) * dstride]));
- dst3_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 3) * dstride]));
+ dst0_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
+ dst2_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 2) * dstride]));
+ dst3_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 3) * dstride]));
dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
_mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
@@ -557,30 +458,121 @@ uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
_mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16));
src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+ // r15 r14 r13------------r1 r0 - 16 bit
sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
- src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
- dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src_16x16 = _mm256_madd_epi16(sub_result, sub_result);
+
+ // accumulation of result
+ square_result = _mm256_add_epi32(square_result, src_16x16);
+ }
- src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16); // 32bit store
- dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16); // 32bit store
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
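
Editor's note: the rewritten aom_mse_4xh_16bit_avx2 squares the absolute differences with a single _mm256_madd_epi16 per iteration and keeps a 32-bit accumulator, widening to 64 bits only once after the row loop instead of widening every pair of products inside it. A scalar model of the value being computed (helper name is illustrative):

#include <stdint.h>

// MSE-style sum of squared differences over a 4-wide, h-tall block.
// dst is an 8-bit frame buffer with stride dstride; src is a 16-bit
// buffer with stride sstride (the AVX2 version reads 4 rows per iteration).
static uint64_t mse_4xh_model(const uint8_t *dst, int dstride,
                              const uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < 4; ++c) {
      const int64_t d = (int64_t)src[r * sstride + c] - dst[r * dstride + c];
      sum += (uint64_t)(d * d);
    }
  }
  return sum;
}
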
- res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
- res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
- res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
- res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+// Compute the mse of four consecutive 4x4 blocks.
+// In the src buffer, each 4x4 block of a 32x32 filter block is stored
+// sequentially, so src_blk_stride equals the number of samples per block
+// (w * h). The dst buffer is a frame buffer, so dstride is a frame-level
+// stride.
+uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int src_blk_stride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8;
+ __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result_0, sub_result_1, sub_result_2, sub_result_3;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = zeros;
+ uint16_t *src_temp = src;
- square_result = _mm256_add_epi64(
- square_result,
- _mm256_add_epi64(
- _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
- res3_4x64));
+ for (int i = 0; i < h; i += 4) {
+ dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride]));
+ dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride]));
+ dst2_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 2) * dstride]));
+ dst3_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 3) * dstride]));
+
+ // row0 of 1st,2nd, 3rd and 4th 4x4 blocks- d00 d10 d20 d30
+ dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8);
+ // row1 of 1st,2nd, 3rd and 4th 4x4 blocks - d01 d11 d21 d31
+ dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8);
+ // row2 of 1st,2nd, 3rd and 4th 4x4 blocks - d02 d12 d22 d32
+ dst2_16x16 = _mm256_cvtepu8_epi16(dst2_16x8);
+ // row3 of 1st,2nd, 3rd and 4th 4x4 blocks - d03 d13 d23 d33
+ dst3_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
+
+ // All rows of 1st 4x4 block - r00 r01 r02 r03
+ __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0]));
+ // All rows of 2nd 4x4 block - r10 r11 r12 r13
+ __m256i src1_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride]));
+ // All rows of 3rd 4x4 block - r20 r21 r22 r23
+ __m256i src2_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[2 * src_blk_stride]));
+ // All rows of 4th 4x4 block - r30 r31 r32 r33
+ __m256i src3_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[3 * src_blk_stride]));
+
+ // r00 r10 r02 r12
+ __m256i tmp0_16x16 = _mm256_unpacklo_epi64(src0_16x16, src1_16x16);
+ // r01 r11 r03 r13
+ __m256i tmp1_16x16 = _mm256_unpackhi_epi64(src0_16x16, src1_16x16);
+ // r20 r30 r22 r32
+ __m256i tmp2_16x16 = _mm256_unpacklo_epi64(src2_16x16, src3_16x16);
+ // r21 r31 r23 r33
+ __m256i tmp3_16x16 = _mm256_unpackhi_epi64(src2_16x16, src3_16x16);
+
+ // r00 r10 r20 r30
+ src0_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x20);
+ // r01 r11 r21 r31
+ src1_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x20);
+ // r02 r12 r22 r32
+ src2_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x31);
+ // r03 r13 r23 r33
+ src3_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x31);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(src0_16x16, dst0_16x16));
+ sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(src1_16x16, dst1_16x16));
+ sub_result_2 = _mm256_abs_epi16(_mm256_sub_epi16(src2_16x16, dst2_16x16));
+ sub_result_3 = _mm256_abs_epi16(_mm256_sub_epi16(src3_16x16, dst3_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0);
+ src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1);
+ src2_16x16 = _mm256_madd_epi16(sub_result_2, sub_result_2);
+ src3_16x16 = _mm256_madd_epi16(sub_result_3, sub_result_3);
+
+ // accumulation of result
+ src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16);
+ src2_16x16 = _mm256_add_epi32(src2_16x16, src3_16x16);
+ const __m256i square_result_0 = _mm256_add_epi32(src0_16x16, src2_16x16);
+ square_result = _mm256_add_epi32(square_result, square_result_0);
+ src_temp += 16;
}
- const __m128i sum_2x64 =
- _mm_add_epi64(_mm256_castsi256_si128(square_result),
- _mm256_extracti128_si256(square_result, 1));
- const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
- xx_storel_64(&sum, sum_1x64);
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
return sum;
}
@@ -589,7 +581,7 @@ uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
uint64_t sum = 0;
__m128i dst0_8x8, dst1_8x8, dst3_16x8;
__m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
- __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i res0_4x64, res1_4x64;
__m256i sub_result;
const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
__m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
@@ -606,38 +598,98 @@ uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
_mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+ // r15 r14 r13 - - - r1 r0 - 16 bit
sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
- src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
- dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src_16x16 = _mm256_madd_epi16(sub_result, sub_result);
- src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
- dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+ // accumulation of result
+ square_result = _mm256_add_epi32(square_result, src_16x16);
+ }
- res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
- res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
- res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
- res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
- square_result = _mm256_add_epi64(
- square_result,
- _mm256_add_epi64(
- _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
- res3_4x64));
+// Compute the mse of two consecutive 8x8 blocks.
+// In the src buffer, each 8x8 block of a 64x64 filter block is stored
+// sequentially, so src_blk_stride equals the number of samples per block
+// (w * h). The dst buffer is a frame buffer, so dstride is a frame-level
+// stride.
+uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int src_blk_stride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_16x8, dst1_16x8;
+ __m256i dst0_16x16, dst1_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result_0, sub_result_1;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = zeros;
+ uint16_t *src_temp = src;
+
+ for (int i = 0; i < h; i += 2) {
+ dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride]));
+ dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride]));
+
+ // row0 of 1st and 2nd 8x8 block - d00 d10
+ dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8);
+ // row1 of 1st and 2nd 8x8 block - d01 d11
+ dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8);
+
+ // 2 rows of 1st 8x8 block - r00 r01
+ __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0]));
+ // 2 rows of 2nd 8x8 block - r10 r11
+ __m256i src1_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride]));
+ // r00 r10 - 128bit
+ __m256i tmp0_16x16 =
+ _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x20);
+ // r01 r11 - 128bit
+ __m256i tmp1_16x16 =
+ _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x31);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(tmp0_16x16, dst0_16x16));
+ sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(tmp1_16x16, dst1_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit each
+ src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0);
+ src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1);
+
+ // accumulation of result
+ src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16);
+ square_result = _mm256_add_epi32(square_result, src0_16x16);
+ src_temp += 16;
}
- const __m128i sum_2x64 =
- _mm_add_epi64(_mm256_castsi256_si128(square_result),
- _mm256_extracti128_si256(square_result, 1));
- const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
- xx_storel_64(&sum, sum_1x64);
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
return sum;
}
uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
int sstride, int w, int h) {
assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
- "w=8/4 and h=8/4 must satisfy");
+ "w=8/4 and h=8/4 must be satisfied");
switch (w) {
case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
@@ -645,36 +697,34 @@ uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
}
}
-static INLINE void sum_final_256bit_avx2(__m256i sum_8x16[2], int *const sum) {
- const __m256i sum_result_0 = _mm256_hadd_epi16(sum_8x16[0], sum_8x16[1]);
- const __m256i sum_result_1 =
- _mm256_add_epi16(_mm256_srli_si256(sum_result_0, 4), sum_result_0);
- const __m256i sum_result_2 =
- _mm256_add_epi16(_mm256_srli_si256(sum_result_1, 2), sum_result_1);
- const __m128i sum_128_high = _mm256_extractf128_si256(sum_result_2, 1);
- const __m128i sum_result_3 =
- _mm_unpacklo_epi16(_mm256_castsi256_si128(sum_result_2), sum_128_high);
- const __m128i sum_result_4 =
- _mm_unpackhi_epi16(_mm256_castsi256_si128(sum_result_2), sum_128_high);
- const __m128i sum_result_5 = _mm_unpacklo_epi32(sum_result_3, sum_result_4);
-
- _mm_storeu_si128((__m128i *)sum, _mm_cvtepi16_epi32(sum_result_5));
-}
-
-static INLINE void calc_sum_sse_for_8x32_block_avx2(const uint8_t *src,
- const uint8_t *ref,
- __m256i sse_8x16[2],
- __m256i sum_8x16[2]) {
- const __m256i s0_256 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)src));
- const __m256i r0_256 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)ref));
- const __m256i s1_256 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(src + 16)));
- const __m256i r1_256 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(ref + 16)));
- const __m256i diff0 = _mm256_sub_epi16(s0_256, r0_256);
- const __m256i diff1 = _mm256_sub_epi16(s1_256, r1_256);
+// Computes the mse of two consecutive 8x8 blocks or four consecutive 4x4
+// blocks. The luma plane uses 8x8 blocks and the chroma planes use 4x4 blocks.
+// In the src buffer, each block of a filter block is stored sequentially, so
+// src_blk_stride equals the number of samples per block (w * h). The dst
+// buffer is a frame buffer, so dstride is a frame-level stride.
+uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ switch (w) {
+ case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
+ case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
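
Editor's note: aom_mse_16xh_16bit_avx2 dispatches to the quad (4x4) or dual (8x8) kernel with src_blk_stride = w * h, so consecutive blocks sit back to back in src while the matching dst blocks sit side by side in the frame, w pixels apart. A scalar model consistent with that layout (written for illustration, not taken from the patch):

#include <stdint.h>

static uint64_t mse_16xh_model(const uint8_t *dst, int dstride,
                               const uint16_t *src, int w, int h) {
  const int num_blocks = 16 / w;  // two 8x8 blocks or four 4x4 blocks
  uint64_t sum = 0;
  for (int b = 0; b < num_blocks; ++b) {
    for (int r = 0; r < h; ++r) {
      for (int c = 0; c < w; ++c) {
        const int64_t d = (int64_t)src[b * w * h + r * w + c] -
                          dst[r * dstride + b * w + c];
        sum += (uint64_t)(d * d);
      }
    }
  }
  return sum;
}
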
+
+static INLINE void calc_sum_sse_wd32_avx2(const uint8_t *src,
+ const uint8_t *ref,
+ __m256i set_one_minusone,
+ __m256i sse_8x16[2],
+ __m256i sum_8x16[2]) {
+ const __m256i s00_256 = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r00_256 = _mm256_loadu_si256((__m256i const *)(ref));
+
+ const __m256i u_low_256 = _mm256_unpacklo_epi8(s00_256, r00_256);
+ const __m256i u_high_256 = _mm256_unpackhi_epi8(s00_256, r00_256);
+
+ const __m256i diff0 = _mm256_maddubs_epi16(u_low_256, set_one_minusone);
+ const __m256i diff1 = _mm256_maddubs_epi16(u_high_256, set_one_minusone);
sse_8x16[0] = _mm256_add_epi32(sse_8x16[0], _mm256_madd_epi16(diff0, diff0));
sse_8x16[1] = _mm256_add_epi32(sse_8x16[1], _mm256_madd_epi16(diff1, diff1));
@@ -682,48 +732,141 @@ static INLINE void calc_sum_sse_for_8x32_block_avx2(const uint8_t *src,
sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff1);
}
-static INLINE void get_sse_sum_8x8_quad_avx2(const uint8_t *src,
- const int src_stride,
- const uint8_t *ref,
- const int ref_stride, const int h,
- unsigned int *const sse,
- int *const sum) {
+static INLINE __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16,
+ unsigned int *tot_sse, int *tot_sum) {
+ // s00 s01 s10 s11 s20 s21 s30 s31
+ const __m256i sse_results = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]);
+ // d00 d01 d02 d03 | d10 d11 d12 d13 | d20 d21 d22 d23 | d30 d31 d32 d33
+ const __m256i sum_result_r0 = _mm256_hadd_epi16(sum_hx16[0], sum_hx16[1]);
+ // d00 d01 d10 d11 | d00 d02 d10 d11 | d20 d21 d30 d31 | d20 d21 d30 d31
+ const __m256i sum_result_1 = _mm256_hadd_epi16(sum_result_r0, sum_result_r0);
+ // d00 d01 d10 d11 d20 d21 d30 d31 | X
+ const __m256i sum_result_3 = _mm256_permute4x64_epi64(sum_result_1, 0x08);
+ // d00 d01 d10 d11 d20 d21 d30 d31
+ const __m256i sum_results =
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum_result_3));
+
+ // Add sum & sse registers appropriately to get total sum & sse separately.
+ // s0 s1 d0 d1 s2 s3 d2 d3
+ const __m256i sum_sse_add = _mm256_hadd_epi32(sse_results, sum_results);
+ // s0 s1 s2 s3 d0 d1 d2 d3
+ const __m256i sum_sse_order_add = _mm256_permute4x64_epi64(sum_sse_add, 0xd8);
+ // s0+s1 s2+s3 s0+s1 s2+s3 d0+d1 d2+d3 d0+d1 d2+d3
+ const __m256i sum_sse_order_add_1 =
+ _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add);
+ // s0 x x x | d0 x x x
+ const __m256i sum_sse_order_add_final =
+ _mm256_hadd_epi32(sum_sse_order_add_1, sum_sse_order_add_1);
+ // s0
+ const uint32_t first_value =
+ (uint32_t)_mm256_extract_epi32(sum_sse_order_add_final, 0);
+ *tot_sse += first_value;
+ // d0
+ const int second_value = _mm256_extract_epi32(sum_sse_order_add_final, 4);
+ *tot_sum += second_value;
+ return sum_sse_order_add;
+}
+
+static INLINE void get_var_sse_sum_8x8_quad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref,
+ const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) {
assert(h <= 128); // May overflow for larger height.
__m256i sse_8x16[2], sum_8x16[2];
sum_8x16[0] = _mm256_setzero_si256();
sse_8x16[0] = _mm256_setzero_si256();
sum_8x16[1] = sum_8x16[0];
sse_8x16[1] = sse_8x16[0];
+ const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
- for (int i = 0; i < h; i += 2) {
- // Process 8x32 block of first row.
- calc_sum_sse_for_8x32_block_avx2(src, ref, sse_8x16, sum_8x16);
+ for (int i = 0; i < h; i++) {
+ // Process 8x32 block of one row.
+ calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_8x16, sum_8x16);
+ src += src_stride;
+ ref += ref_stride;
+ }
- // Process 8x32 block of second row.
- calc_sum_sse_for_8x32_block_avx2(src + src_stride, ref + ref_stride,
- sse_8x16, sum_8x16);
+ const __m256i sum_sse_order_add =
+ calc_sum_sse_order(sse_8x16, sum_8x16, tot_sse, tot_sum);
+
+ // s0 s1 s2 s3
+ _mm_storeu_si128((__m128i *)sse8x8,
+ _mm256_castsi256_si128(sum_sse_order_add));
+ // d0 d1 d2 d3
+ const __m128i sum_temp8x8 = _mm256_extractf128_si256(sum_sse_order_add, 1);
+ _mm_storeu_si128((__m128i *)sum8x8, sum_temp8x8);
+
+ // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3
+ const __m128i mull_results =
+ _mm_srli_epi32(_mm_mullo_epi32(sum_temp8x8, sum_temp8x8), 6);
+ // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3
+ const __m128i variance_8x8 =
+ _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add), mull_results);
+ // v0 v1 v2 v3
+ _mm_storeu_si128((__m128i *)var8x8, variance_8x8);
+}
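
Editor's note: after the row loop, get_var_sse_sum_8x8_quad_avx2 derives the four 8x8 variances directly from the per-block SSE and sum as var = sse - (sum * sum) >> 6, since an 8x8 block has 64 samples. A scalar restatement of that final step (for illustration only):

#include <stdint.h>

static void var_from_sse_sum_8x8(const uint32_t sse[4], const int sum[4],
                                 uint32_t var[4]) {
  for (int k = 0; k < 4; ++k) {
    // 64 samples per 8x8 block, hence the shift by 6.
    var[k] = sse[k] - (uint32_t)(((int64_t)sum[k] * sum[k]) >> 6);
  }
}
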
+
+static INLINE void get_var_sse_sum_16x16_dual_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref,
+ const int ref_stride, const int h, uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) {
+ assert(h <= 128); // May overflow for larger height.
+ __m256i sse_16x16[2], sum_16x16[2];
+ sum_16x16[0] = _mm256_setzero_si256();
+ sse_16x16[0] = _mm256_setzero_si256();
+ sum_16x16[1] = sum_16x16[0];
+ sse_16x16[1] = sse_16x16[0];
+ const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
- src += src_stride << 1;
- ref += ref_stride << 1;
+ for (int i = 0; i < h; i++) {
+ // Process 16x32 block of one row.
+ calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_16x16, sum_16x16);
+ src += src_stride;
+ ref += ref_stride;
}
- // Add sse registers appropriately to get each 8x8 block sse separately.
- const __m256i sse_result_1 = _mm256_hadd_epi32(sse_8x16[0], sse_8x16[1]);
- const __m256i sse_result_2 =
- _mm256_hadd_epi32(sse_result_1, _mm256_setzero_si256());
- const __m256i sse_result_3 = _mm256_permute4x64_epi64(sse_result_2, 0xd8);
+ const __m256i sum_sse_order_add =
+ calc_sum_sse_order(sse_16x16, sum_16x16, tot_sse, tot_sum);
- _mm_storeu_si128(
- (__m128i *)sse,
- _mm_shuffle_epi32(_mm256_castsi256_si128(sse_result_3), 0xd8));
+ const __m256i sum_sse_order_add_1 =
+ _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add);
+
+ // s0+s1 s2+s3 x x
+ _mm_storel_epi64((__m128i *)sse16x16,
+ _mm256_castsi256_si128(sum_sse_order_add_1));
+
+ // d0+d1 d2+d3 x x
+ const __m128i sum_temp16x16 =
+ _mm256_extractf128_si256(sum_sse_order_add_1, 1);
+
+  // (((d0+d1) * (d0+d1)) >> 8)=f0 (((d2+d3) * (d2+d3)) >> 8)=f1
+  const __m128i mull_results =
+      _mm_srli_epi32(_mm_mullo_epi32(sum_temp16x16, sum_temp16x16), 8);
+
+  // (s0+s1)-f0=v0 (s2+s3)-f1=v1
+  const __m128i variance_16x16 =
+      _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add_1), mull_results);
+
+  // v0 v1
+  _mm_storel_epi64((__m128i *)var16x16, variance_16x16);
+}
- // Add sum registers appropriately to get each 8x8 block sum separately.
- sum_final_256bit_avx2(sum_8x16, sum);
+void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ get_var_sse_sum_8x8_quad_avx2(src_ptr, source_stride, ref_ptr, ref_stride, 8,
+ sse8x8, sum8x8, tot_sse, tot_sum, var8x8);
}
-void aom_get_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
- get_sse_sum_8x8_quad_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse,
- sum);
+void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ get_var_sse_sum_16x16_dual_avx2(src_ptr, source_stride, ref_ptr, ref_stride,
+ 16, sse16x16, tot_sse, tot_sum, var16x16);
}
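
Editor's note: the exported wrapper aom_get_sse_sum_8x8_quad_avx2 is replaced by aom_get_var_sse_sum_8x8_quad_avx2, and a 16x16 dual variant is added; both now also return per-block variances and accumulate running totals. A hedged usage sketch of the new entry points, with prototypes copied from this diff and the surrounding setup purely illustrative (note tot_sse/tot_sum are accumulated into, so they are zeroed first):

#include <stdint.h>

void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr,
                                       int source_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse8x8, int *sum8x8,
                                       unsigned int *tot_sse, int *tot_sum,
                                       uint32_t *var8x8);
void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         uint32_t *sse16x16,
                                         unsigned int *tot_sse, int *tot_sum,
                                         uint32_t *var16x16);

static void example_usage(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride) {
  unsigned int tot_sse = 0;
  int tot_sum = 0;

  uint32_t sse8x8[4], var8x8[4];
  int sum8x8[4];
  // Four 8x8 blocks covering a 32x8 strip starting at (src, ref).
  aom_get_var_sse_sum_8x8_quad_avx2(src, src_stride, ref, ref_stride, sse8x8,
                                    sum8x8, &tot_sse, &tot_sum, var8x8);

  uint32_t sse16x16[2], var16x16[2];
  // Two 16x16 blocks covering a 32x16 strip starting at (src, ref).
  aom_get_var_sse_sum_16x16_dual_avx2(src, src_stride, ref, ref_stride,
                                      sse16x16, &tot_sse, &tot_sum, var16x16);
}
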
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index 163e4cc56..9e9e70ea0 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -163,652 +163,762 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
src_lo = _mm_srai_epi16(src_lo, 4); \
src_hi = _mm_srai_epi16(src_hi, 4);
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse) {
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = 4
- } else if (y_offset == 4) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
+// TODO(chiyotsai@google.com): These variance functions are macro-fied so we
+// don't have to manually optimize the individual for-loops. We could save some
+// binary size by optimizing the loops more carefully instead of duplicating
+// the code with a macro.
+#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) \
+ static AOM_INLINE int aom_sub_pixel_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+        /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+        /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* average between previous average to current average */ \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ /* save current source average */ \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* merge previous pack to current pack source */ \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ const int sum = aom_sub_pixel_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
+ }
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- }
- // x_offset = 4 and y_offset = 0
- } else if (x_offset == 4) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 4 and y_offset = 4
- } else if (y_offset == 4) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- // save current source average
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = 4 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 4
- } else if (y_offset == 4) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src_pack = src_reg;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
+MAKE_SUB_PIXEL_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_VAR_32XH(16, 4)
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
}
- CALC_SUM_AND_SSE
- _mm256_zeroupper();
- return sum;
-}
-unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse) {
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
+// Note: hf = AOMMIN(h, 64); the height is capped at 64 to avoid overflow in
+// the helper.
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)
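
Editor's note: for blocks the 32xH helper cannot cover in one pass, AOM_SUB_PIXEL_VAR_AVX2 tiles the w x h block into wf x hf pieces (hf capped at 64), accumulates the per-tile sums and SSEs, and only then forms the variance as sse - sum^2 / (w * h). A scalar sketch of that decomposition; tile_sum_sse() is a hypothetical stand-in for the _imp_avx2 helpers and, unlike them, ignores the bilinear subpel filtering:

#include <stdint.h>

// Hypothetical stand-in for the per-tile helper: returns the sum of
// (src - dst) over one wf x hf tile and writes the tile's SSE to *sse.
static int tile_sum_sse(const uint8_t *src, int src_stride, const uint8_t *dst,
                        int dst_stride, int wf, int hf, unsigned int *sse) {
  int sum = 0;
  unsigned int s = 0;
  for (int r = 0; r < hf; ++r) {
    for (int c = 0; c < wf; ++c) {
      const int d = src[r * src_stride + c] - dst[r * dst_stride + c];
      sum += d;
      s += (unsigned int)(d * d);
    }
  }
  *sse = s;
  return sum;
}

static unsigned int subpel_var_tiled(const uint8_t *src, int src_stride,
                                     const uint8_t *dst, int dst_stride, int w,
                                     int h, int wf, int hf,
                                     unsigned int *sse_ptr) {
  unsigned int sse = 0;
  int se = 0;
  for (int i = 0; i < w / wf; ++i) {
    for (int j = 0; j < h / hf; ++j) {
      unsigned int sse2;
      se += tile_sum_sse(src + i * wf + j * hf * src_stride, src_stride,
                         dst + i * wf + j * hf * dst_stride, dst_stride, wf,
                         hf, &sse2);
      sse += sse2;
    }
  }
  *sse_ptr = sse;
  // variance = SSE - sum^2 / N, with N = w * h samples.
  return sse - (unsigned int)(((int64_t)se * se) / (w * h));
}
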
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height; i += 2) {
- LOAD_SRC_DST_INSERT(src_stride, dst_stride)
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += (src_stride << 1);
- dst += (dst_stride << 1);
- }
- // x_offset = 0 and y_offset = 4
- } else if (y_offset == 4) {
- __m256i src_next_reg;
- for (i = 0; i < height; i += 2) {
- LOAD_SRC_DST_INSERT(src_stride, dst_stride)
- AVG_NEXT_SRC_INSERT(src_reg, src_stride)
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += (src_stride << 1);
- dst += (dst_stride << 1);
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i += 2) {
- LOAD_SRC_DST_INSERT(src_stride, dst_stride)
- MERGE_NEXT_SRC_INSERT(src_reg, src_stride)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += (src_stride << 1);
- dst += (dst_stride << 1);
- }
- }
- // x_offset = 4 and y_offset = 0
- } else if (x_offset == 4) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height; i += 2) {
- LOAD_SRC_NEXT_BYTE_INSERT
- LOAD_DST_INSERT
- /* average between current and next stride source */
- src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += (src_stride << 1);
- dst += (dst_stride << 1);
- }
- // x_offset = 4 and y_offset = 4
- } else if (y_offset == 4) {
- __m256i src_next_reg, src_avg, src_temp;
- // load and insert source and next row source
- LOAD_SRC_NEXT_BYTE_INSERT
- src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
- src += src_stride << 1;
- for (i = 0; i < height - 2; i += 2) {
- LOAD_SRC_NEXT_BYTE_INSERT
- src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
- src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
- src_temp = _mm256_avg_epu8(src_avg, src_temp);
- LOAD_DST_INSERT
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_temp, zero_reg)
- // save current source average
- src_avg = src_next_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride << 1;
- src += src_stride << 1;
- }
- // last 2 rows processing happens here
- __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
- __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
- src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
- src_next_reg = _mm256_permute2x128_si256(
- src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
- LOAD_DST_INSERT
- src_avg = _mm256_avg_epu8(src_avg, src_next_reg);
- MERGE_WITH_SRC(src_avg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- } else {
- // x_offset = 4 and y_offset = bilin interpolation
- __m256i filter, pw8, src_next_reg, src_avg, src_temp;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load and insert source and next row source
- LOAD_SRC_NEXT_BYTE_INSERT
- src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
- src += src_stride << 1;
- for (i = 0; i < height - 2; i += 2) {
- LOAD_SRC_NEXT_BYTE_INSERT
- src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
- src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
- LOAD_DST_INSERT
- MERGE_WITH_SRC(src_avg, src_temp)
- // save current source average
- src_avg = src_next_reg;
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride << 1;
- src += src_stride << 1;
- }
- // last 2 rows processing happens here
- __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
- __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
- src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
- src_next_reg = _mm256_permute2x128_si256(
- src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
- LOAD_DST_INSERT
- MERGE_WITH_SRC(src_avg, src_next_reg)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i += 2) {
- LOAD_SRC_DST_INSERT(src_stride, dst_stride)
- MERGE_NEXT_SRC_INSERT(src_reg, 1)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += (src_stride << 1);
- dst += (dst_stride << 1);
- }
- // x_offset = bilin interpolation and y_offset = 4
- } else if (y_offset == 4) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- // load and insert source and next row source
- LOAD_SRC_NEXT_BYTE_INSERT
- MERGE_WITH_SRC(src_reg, src_next_reg)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- src += src_stride << 1;
- for (i = 0; i < height - 2; i += 2) {
- LOAD_SRC_NEXT_BYTE_INSERT
- LOAD_DST_INSERT
- MERGE_WITH_SRC(src_reg, src_next_reg)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src_pack = src_reg;
- src += src_stride << 1;
- dst += dst_stride << 1;
- }
- // last 2 rows processing happens here
- LOAD_SRC_MERGE_128BIT(filter)
- LOAD_DST_INSERT
- FILTER_SRC_128BIT(filter_128bit)
- src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
- src_next_reg = _mm256_permute2x128_si256(
- src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- } else {
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load and insert source and next row source
- LOAD_SRC_NEXT_BYTE_INSERT
- MERGE_WITH_SRC(src_reg, src_next_reg)
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- src += src_stride << 1;
- for (i = 0; i < height - 2; i += 2) {
- LOAD_SRC_NEXT_BYTE_INSERT
- LOAD_DST_INSERT
- MERGE_WITH_SRC(src_reg, src_next_reg)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
- // average between previous pack to the current
- MERGE_WITH_SRC(src_pack, src_next_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride << 1;
- dst += dst_stride << 1;
- }
- // last 2 rows processing happens here
- LOAD_SRC_MERGE_128BIT(xfilter)
- LOAD_DST_INSERT
- FILTER_SRC_128BIT(filter_128bit)
- src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
- src_next_reg = _mm256_permute2x128_si256(
- src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
- MERGE_WITH_SRC(src_pack, src_next_reg)
- FILTER_SRC(yfilter)
- CALC_SUM_SSE_INSIDE_LOOP
- }
+#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) \
+ unsigned int aom_sub_pixel_variance16x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+        /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ AVG_NEXT_SRC_INSERT(src_reg, src_stride) \
+        /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg, src_temp; \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ src_temp = _mm256_avg_epu8(src_avg, src_temp); \
+ LOAD_DST_INSERT \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_temp, zero_reg) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ src_avg = _mm256_avg_epu8(src_avg, src_next_reg); \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ __m256i filter, pw8, src_next_reg, src_avg, src_temp; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_temp) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_next_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(filter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+ /* average between previous pack to the current */ \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(xfilter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ FILTER_SRC(yfilter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height)); \
}
- CALC_SUM_AND_SSE
- _mm256_zeroupper();
- return sum;
-}
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
- int height, unsigned int *sse) {
- __m256i sec_reg;
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
+MAKE_SUB_PIXEL_VAR_16XH(32, 5)
+MAKE_SUB_PIXEL_VAR_16XH(16, 4)
+MAKE_SUB_PIXEL_VAR_16XH(8, 3)
+#if !CONFIG_REALTIME_ONLY
+MAKE_SUB_PIXEL_VAR_16XH(64, 6)
+MAKE_SUB_PIXEL_VAR_16XH(4, 2)
+#endif
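
The instantiations above generate aom_sub_pixel_variance16x{64,32,16,8,4}_avx2. A minimal sketch of the final step they all share, assuming only the accumulated SSE and sum are available (the shift 4 + log2height is log2 of the 16*height pixel count; the helper name here is hypothetical):

  /* variance = SSE - (sum * sum) / N, with N = 16 << log2height pixels */
  static unsigned int var_from_sums_16xh(unsigned int sse, int sum,
                                         int log2height) {
    return sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height));
  }
  /* e.g. a uniform difference d over a 16x16 block (sum = 256*d,
     sse = 256*d*d) yields 0, as expected for zero variance. */
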
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- } else if (y_offset == 4) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
+#define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \
+ int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
+ unsigned int *sse) { \
+ __m256i sec_reg; \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+        /* expand each byte to 2 bytes */                              \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+        /* expand each byte to 2 bytes */                              \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* average between previous average to current average */ \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* merge previous pack to current pack source */ \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_avg_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse, \
+ const uint8_t *sec_ptr) { \
+ const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \
+ sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
+ }
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- }
- // x_offset = 4 and y_offset = 0
- } else if (x_offset == 4) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 4 and y_offset = 4
- } else if (y_offset == 4) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = 4 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- MERGE_WITH_SRC(src_reg, zero_reg)
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 4
- } else if (y_offset == 4) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- sec += sec_stride;
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
}
- CALC_SUM_AND_SSE
- _mm256_zeroupper();
- return sum;
-}
+
+// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height.
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5)
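
For these large block sizes the wrapper macro tiles the w x h block into (w/wf) x (h/hf) tiles of wf x hf, accumulates the per-tile sums and SSEs returned by the 32-wide AVX2 helper, and applies the variance formula once at the whole-block level. A scalar sketch of that accumulation, using the macro's parameter names and a hypothetical per-tile helper tile_sum_sse():

  unsigned int total_sse = 0;
  int total_sum = 0;
  for (int col = 0; col < w / wf; ++col) {
    for (int row = 0; row < h / hf; ++row) {
      unsigned int tile_sse;
      /* per-tile sum of differences and SSE (hypothetical helper) */
      const int tile_sum = tile_sum_sse(col, row, &tile_sse);
      total_sum += tile_sum;
      total_sse += tile_sse;
    }
  }
  *sse_ptr = total_sse;
  /* e.g. 128x128 with 32x64 tiles: 4 * 2 = 8 tile calls; the final shift
     wlog2 + hlog2 = 14 divides by the block's 16384 pixels. */
  return total_sse -
         (unsigned int)(((int64_t)total_sum * total_sum) >> (wlog2 + hlog2));
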
diff --git a/aom_dsp/x86/variance_impl_ssse3.c b/aom_dsp/x86/variance_impl_ssse3.c
index 66b0d7d84..699002195 100644
--- a/aom_dsp/x86/variance_impl_ssse3.c
+++ b/aom_dsp/x86/variance_impl_ssse3.c
@@ -25,8 +25,8 @@ void aom_var_filter_block2d_bil_first_pass_ssse3(
// Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
const __m128i r = _mm_set1_epi16(round);
- const uint8_t f0 = filter[0] >> 1;
- const uint8_t f1 = filter[1] >> 1;
+ const int8_t f0 = (int8_t)(filter[0] >> 1);
+ const int8_t f1 = (int8_t)(filter[1] >> 1);
const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
f0, f1, f0, f1, f0, f1);
unsigned int i, j;
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index a0223a9e4..7d4ff4f8f 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -36,8 +36,8 @@ unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
}
static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
- const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
- const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
}
@@ -103,7 +103,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_unpacklo_epi16(vsum, vsum);
vsum = _mm_srai_epi32(vsum, 16);
- *sum = add32x4_sse2(vsum);
+ *sum = (int)add32x4_sse2(vsum);
}
// Can handle 1024 pixels' diff sum (such as 32x32)
@@ -113,7 +113,7 @@ static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
*sse = add32x4_sse2(vsse);
vsum = sum_to_32bit_sse2(vsum);
- *sum = add32x4_sse2(vsum);
+ *sum = (int)add32x4_sse2(vsum);
}
static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
@@ -240,9 +240,11 @@ void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
variance_final_128_pel_sse2(vsse, vsum, sse, sum);
}
-void aom_get_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
+void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
// Loop over 4 8x8 blocks. Process one 8x32 block.
for (int k = 0; k < 4; k++) {
const uint8_t *src = src_ptr;
@@ -259,8 +261,14 @@ void aom_get_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride,
src += src_stride;
ref += ref_stride;
}
- variance_final_128_pel_sse2(vsse, vsum, &sse[k], &sum[k]);
+ variance_final_128_pel_sse2(vsse, vsum, &sse8x8[k], &sum8x8[k]);
}
+
+ // Calculate variance at 8x8 level and total sse, sum of 8x32 block.
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++)
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
}
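
A minimal caller-side sketch of the widened quad interface, assuming src/ref point at an 8x32 region (all buffer names hypothetical). The per-block shift of 6 corresponds to the 64 pixels of an 8x8 block, and the tot_* outputs accumulate, so they must be initialized before the call:

  uint32_t sse8x8[4], var8x8[4];
  int sum8x8[4];
  unsigned int tot_sse = 0;
  int tot_sum = 0;
  aom_get_var_sse_sum_8x8_quad_sse2(src, src_stride, ref, ref_stride, sse8x8,
                                    sum8x8, &tot_sse, &tot_sum, var8x8);
  /* var8x8[i] == sse8x8[i] - sum8x8[i]^2 / 64 for each of the four blocks */
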
#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \
@@ -314,7 +322,7 @@ AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024)
ref += (ref_stride * uh); \
} \
*sse = add32x4_sse2(vsse); \
- int sum = add32x4_sse2(vsum); \
+ int sum = (int)add32x4_sse2(vsum); \
assert(sum <= 255 * bw * bh); \
assert(sum >= -255 * bw * bh); \
return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
@@ -673,13 +681,13 @@ uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
uint64_t sum = 0;
__m128i dst0_8x8, dst1_8x8, dst_16x8;
__m128i src0_16x4, src1_16x4, src_16x8;
- __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
__m128i sub_result_16x8;
const __m128i zeros = _mm_setzero_si128();
__m128i square_result = _mm_setzero_si128();
for (int i = 0; i < h; i += 2) {
- dst0_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
- dst1_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
+ dst0_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
@@ -688,26 +696,17 @@ uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
- res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
- res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
-
- res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
- res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
- res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
- res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
- res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
- res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
- square_result = _mm_add_epi64(
- square_result,
- _mm_add_epi64(
- _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
- res3_64x4));
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
}
- const __m128i sum_1x64 =
+ const __m128i sum_64x1 =
_mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
- xx_storel_64(&sum, sum_1x64);
+ xx_storel_64(&sum, sum_64x1);
return sum;
}
@@ -716,7 +715,7 @@ uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
uint64_t sum = 0;
__m128i dst_8x8, dst_16x8;
__m128i src_16x8;
- __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
__m128i sub_result_16x8;
const __m128i zeros = _mm_setzero_si128();
__m128i square_result = _mm_setzero_si128();
@@ -729,26 +728,17 @@ uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
- res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
- res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
-
- res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
- res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
- res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
- res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
- res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
- res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
- square_result = _mm_add_epi64(
- square_result,
- _mm_add_epi64(
- _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
- res3_64x4));
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
}
- const __m128i sum_1x64 =
+ const __m128i sum_64x1 =
_mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
- xx_storel_64(&sum, sum_1x64);
+ xx_storel_64(&sum, sum_64x1);
return sum;
}
@@ -762,3 +752,17 @@ uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
default: assert(0 && "unsupported width"); return -1;
}
}
+
+uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ const int num_blks = 16 / w;
+ uint64_t sum = 0;
+ for (int i = 0; i < num_blks; i++) {
+ sum += aom_mse_wxh_16bit_sse2(dst, dstride, src, w, w, h);
+ dst += w;
+ src += (w * h);
+ }
+ return sum;
+}
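
A small usage sketch of the new 16-wide MSE entry point (pointer names hypothetical). The strip is split into 16/w packed sub-blocks, so src is read with a stride of w while dst uses dstride; the return value is the raw sum of squared differences, not yet normalized:

  /* two 8x8 sub-blocks covering a 16x8 strip */
  const uint64_t ssd = aom_mse_16xh_16bit_sse2(dst, dstride, src, /*w=*/8,
                                               /*h=*/8);
  /* mean squared error over the strip: ssd / (16 * 8) */
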
diff --git a/aom_mem/aom_mem.c b/aom_mem/aom_mem.c
index f13ee2fa2..807ddcf05 100644
--- a/aom_mem/aom_mem.c
+++ b/aom_mem/aom_mem.c
@@ -80,10 +80,3 @@ void aom_free(void *memblk) {
free(addr);
}
}
-
-void *aom_memset16(void *dest, int val, size_t length) {
- size_t i;
- uint16_t *dest16 = (uint16_t *)dest;
- for (i = 0; i < length; i++) *dest16++ = val;
- return dest;
-}
diff --git a/aom_mem/aom_mem.h b/aom_mem/aom_mem.h
index bc5d8bca3..ca4af7fc6 100644
--- a/aom_mem/aom_mem.h
+++ b/aom_mem/aom_mem.h
@@ -36,7 +36,13 @@ void *aom_memalign(size_t align, size_t size);
void *aom_malloc(size_t size);
void *aom_calloc(size_t num, size_t size);
void aom_free(void *memblk);
-void *aom_memset16(void *dest, int val, size_t length);
+
+static INLINE void *aom_memset16(void *dest, int val, size_t length) {
+ size_t i;
+ uint16_t *dest16 = (uint16_t *)dest;
+ for (i = 0; i < length; i++) *dest16++ = val;
+ return dest;
+}
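
A one-line usage sketch of the now-inline helper (the buffer is hypothetical):

  uint16_t row[64];
  aom_memset16(row, 512, 64);  /* fill 64 16-bit samples with the value 512 */
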
/*returns an addr aligned to the byte boundary specified by align*/
#define aom_align_addr(addr, align) \
diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index e9bb8adbc..e39684202 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -71,6 +71,8 @@
#define ALIGN_POWER_OF_TWO(value, n) \
(((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+#define ALIGN_POWER_OF_TWO_UNSIGNED(value, n) \
+ (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1))
#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
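
The unsigned variant mirrors ALIGN_POWER_OF_TWO but keeps the mask arithmetic unsigned, which avoids signed-overflow and implicit-conversion warnings when the value is already unsigned. A quick worked example with assumed inputs:

  /* round 13 up to a multiple of 2^3 = 8 */
  const unsigned int aligned = ALIGN_POWER_OF_TWO_UNSIGNED(13, 3);  /* -> 16 */
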
diff --git a/aom_ports/x86.h b/aom_ports/x86.h
index 79cbd02bf..f390dfac6 100644
--- a/aom_ports/x86.h
+++ b/aom_ports/x86.h
@@ -387,7 +387,7 @@ static INLINE unsigned int x87_set_double_precision(void) {
// Reserved 01B
// Double Precision (53-Bits) 10B
// Extended Precision (64-Bits) 11B
- x87_set_control_word((mode & ~0x300) | 0x200);
+ x87_set_control_word((mode & ~0x300u) | 0x200u);
return mode;
}
diff --git a/aom_scale/aom_scale.cmake b/aom_scale/aom_scale.cmake
index e83299320..ea94dbc06 100644
--- a/aom_scale/aom_scale.cmake
+++ b/aom_scale/aom_scale.cmake
@@ -20,20 +20,12 @@ list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h"
"${AOM_ROOT}/aom_scale/generic/yv12extend.c"
"${AOM_ROOT}/aom_scale/yv12config.h")
-list(APPEND AOM_SCALE_INTRIN_DSPR2
- "${AOM_ROOT}/aom_scale/mips/dspr2/yv12extend_dspr2.c")
-
# Creates the aom_scale build target and makes libaom depend on it. The libaom
# target must exist before this function is called.
function(setup_aom_scale_targets)
add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES})
target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
- if(HAVE_DSPR2)
- add_intrinsics_object_library("" "dspr2" "aom_scale"
- "AOM_SCALE_INTRIN_DSPR2")
- endif()
-
target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
if(BUILD_SHARED_LIBS)
target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_scale>)
diff --git a/aom_scale/aom_scale_rtcd.pl b/aom_scale/aom_scale_rtcd.pl
index eef6f16a7..e84b6f95e 100644
--- a/aom_scale/aom_scale_rtcd.pl
+++ b/aom_scale/aom_scale_rtcd.pl
@@ -45,11 +45,11 @@ add_proto qw/void aom_yv12_partial_coloc_copy_u/, "const struct yv12_buffer_conf
add_proto qw/void aom_yv12_partial_copy_v/, "const struct yv12_buffer_config *src_bc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_bc, int hstart2, int vstart2";
add_proto qw/void aom_yv12_partial_coloc_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+add_proto qw/void aom_extend_frame_borders_plane_row/, "const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end";
+
add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
-specialize qw/aom_extend_frame_borders dspr2/;
add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
-specialize qw/aom_extend_frame_inner_borders dspr2/;
add_proto qw/void aom_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
1;
diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c
index 5d797c8a5..997ff5434 100644
--- a/aom_scale/generic/yv12extend.c
+++ b/aom_scale/generic/yv12extend.c
@@ -21,19 +21,20 @@
static void extend_plane(uint8_t *const src, int src_stride, int width,
int height, int extend_top, int extend_left,
- int extend_bottom, int extend_right) {
+ int extend_bottom, int extend_right, int v_start,
+ int v_end) {
assert(src != NULL);
int i;
const int linesize = extend_left + extend_right + width;
assert(linesize <= src_stride);
/* copy the left and right most columns out */
- uint8_t *src_ptr1 = src;
- uint8_t *src_ptr2 = src + width - 1;
- uint8_t *dst_ptr1 = src - extend_left;
+ uint8_t *src_ptr1 = src + v_start * src_stride;
+ uint8_t *src_ptr2 = src + v_start * src_stride + width - 1;
+ uint8_t *dst_ptr1 = src + v_start * src_stride - extend_left;
uint8_t *dst_ptr2 = src_ptr2 + 1;
- for (i = 0; i < height; ++i) {
+ for (i = v_start; i < v_end; ++i) {
memset(dst_ptr1, src_ptr1[0], extend_left);
memset(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_stride;
@@ -65,19 +66,20 @@ static void extend_plane(uint8_t *const src, int src_stride, int width,
#if CONFIG_AV1_HIGHBITDEPTH
static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
int height, int extend_top, int extend_left,
- int extend_bottom, int extend_right) {
+ int extend_bottom, int extend_right, int v_start,
+ int v_end) {
int i;
const int linesize = extend_left + extend_right + width;
assert(linesize <= src_stride);
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
/* copy the left and right most columns out */
- uint16_t *src_ptr1 = src;
- uint16_t *src_ptr2 = src + width - 1;
- uint16_t *dst_ptr1 = src - extend_left;
+ uint16_t *src_ptr1 = src + v_start * src_stride;
+ uint16_t *src_ptr2 = src + v_start * src_stride + width - 1;
+ uint16_t *dst_ptr1 = src + v_start * src_stride - extend_left;
uint16_t *dst_ptr2 = src_ptr2 + 1;
- for (i = 0; i < height; ++i) {
+ for (i = v_start; i < v_end; ++i) {
aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_stride;
@@ -107,6 +109,41 @@ static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
}
#endif // CONFIG_AV1_HIGHBITDEPTH
+void aom_extend_frame_borders_plane_row_c(const YV12_BUFFER_CONFIG *ybf,
+ int plane, int v_start, int v_end) {
+ const int ext_size = ybf->border;
+ const int ss_x = ybf->subsampling_x;
+ const int ss_y = ybf->subsampling_y;
+
+ assert(ybf->y_height - ybf->y_crop_height < 16);
+ assert(ybf->y_width - ybf->y_crop_width < 16);
+ assert(ybf->y_height - ybf->y_crop_height >= 0);
+ assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+ const int is_uv = plane > 0;
+ const int top = ext_size >> (is_uv ? ss_y : 0);
+ const int left = ext_size >> (is_uv ? ss_x : 0);
+ const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
+ const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
+ const int extend_top_border = (v_start == 0);
+ const int extend_bottom_border = (v_end == ybf->crop_heights[is_uv]);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv],
+ ybf->crop_widths[is_uv], ybf->crop_heights[is_uv],
+ extend_top_border ? top : 0, left,
+ extend_bottom_border ? bottom : 0, right, v_start, v_end);
+ return;
+ }
+#endif
+
+ extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
+ ybf->crop_widths[is_uv], ybf->crop_heights[is_uv],
+ extend_top_border ? top : 0, left,
+ extend_bottom_border ? bottom : 0, right, v_start, v_end);
+}
+
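
A minimal usage sketch of the new per-row-range entry point (the frame pointer is hypothetical). Because the top border is written only when v_start == 0 and the bottom border only when v_end reaches the cropped height, splitting a plane into disjoint row ranges should extend the same pixels as a single full-plane call, which is what makes it suitable for per-thread work units:

  const int luma_h = frame->crop_heights[0];  /* plane 0 = luma */
  aom_extend_frame_borders_plane_row(frame, /*plane=*/0, 0, luma_h / 2);
  aom_extend_frame_borders_plane_row(frame, /*plane=*/0, luma_h / 2, luma_h);
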
void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
const int num_planes) {
assert(ybf->border % 2 == 0);
@@ -124,7 +161,8 @@ void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
ybf->crop_heights[is_uv], plane_border, plane_border,
plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
- plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]);
+ plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0,
+ ybf->crop_heights[is_uv]);
}
return;
}
@@ -137,7 +175,8 @@ void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
ybf->crop_widths[is_uv], ybf->crop_heights[is_uv],
plane_border, plane_border,
plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
- plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]);
+ plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0,
+ ybf->crop_heights[is_uv]);
}
}
@@ -161,7 +200,7 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size,
const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv],
ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top,
- left, bottom, right);
+ left, bottom, right, 0, ybf->crop_heights[is_uv]);
}
return;
}
@@ -175,7 +214,7 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size,
const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left,
- bottom, right);
+ bottom, right, 0, ybf->crop_heights[is_uv]);
}
}
@@ -199,17 +238,17 @@ void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
assert(ybf->y_width - ybf->y_crop_width >= 0);
#if CONFIG_AV1_HIGHBITDEPTH
if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
- extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
- ybf->y_crop_height, ext_size, ext_size,
- ext_size + ybf->y_height - ybf->y_crop_height,
- ext_size + ybf->y_width - ybf->y_crop_width);
+ extend_plane_high(
+ ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height,
+ ext_size, ext_size, ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width, 0, ybf->y_crop_height);
return;
}
#endif
- extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
- ybf->y_crop_height, ext_size, ext_size,
- ext_size + ybf->y_height - ybf->y_crop_height,
- ext_size + ybf->y_width - ybf->y_crop_width);
+ extend_plane(
+ ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height,
+ ext_size, ext_size, ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width, 0, ybf->y_crop_height);
}
#if CONFIG_AV1_HIGHBITDEPTH
diff --git a/aom_scale/mips/dspr2/yv12extend_dspr2.c b/aom_scale/mips/dspr2/yv12extend_dspr2.c
deleted file mode 100644
index 8556e71a2..000000000
--- a/aom_scale/mips/dspr2/yv12extend_dspr2.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom_scale/yv12config.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_scale/aom_scale.h"
-
-#if HAVE_DSPR2
-static void extend_plane(uint8_t *const src, int src_stride, int width,
- int height, int extend_top, int extend_left,
- int extend_bottom, int extend_right) {
- int i, j;
- uint8_t *left_src, *right_src;
- uint8_t *left_dst_start, *right_dst_start;
- uint8_t *left_dst, *right_dst;
- uint8_t *top_src, *bot_src;
- uint8_t *top_dst, *bot_dst;
- uint32_t left_pix;
- uint32_t right_pix;
- uint32_t linesize;
-
- /* copy the left and right most columns out */
- left_src = src;
- right_src = src + width - 1;
- left_dst_start = src - extend_left;
- right_dst_start = src + width;
-
- for (i = height; i--;) {
- left_dst = left_dst_start;
- right_dst = right_dst_start;
-
- __asm__ __volatile__(
- "lb %[left_pix], 0(%[left_src]) \n\t"
- "lb %[right_pix], 0(%[right_src]) \n\t"
- "replv.qb %[left_pix], %[left_pix] \n\t"
- "replv.qb %[right_pix], %[right_pix] \n\t"
-
- : [left_pix] "=&r"(left_pix), [right_pix] "=&r"(right_pix)
- : [left_src] "r"(left_src), [right_src] "r"(right_src));
-
- for (j = extend_left / 4; j--;) {
- __asm__ __volatile__(
- "sw %[left_pix], 0(%[left_dst]) \n\t"
- "sw %[right_pix], 0(%[right_dst]) \n\t"
-
- :
- : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
- [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
-
- left_dst += 4;
- right_dst += 4;
- }
-
- for (j = extend_left % 4; j--;) {
- __asm__ __volatile__(
- "sb %[left_pix], 0(%[left_dst]) \n\t"
- "sb %[right_pix], 0(%[right_dst]) \n\t"
-
- :
- : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
- [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
-
- left_dst += 1;
- right_dst += 1;
- }
-
- left_src += src_stride;
- right_src += src_stride;
- left_dst_start += src_stride;
- right_dst_start += src_stride;
- }
-
- /* Now copy the top and bottom lines into each line of the respective
- * borders
- */
- top_src = src - extend_left;
- bot_src = src + src_stride * (height - 1) - extend_left;
- top_dst = src + src_stride * (-extend_top) - extend_left;
- bot_dst = src + src_stride * (height)-extend_left;
- linesize = extend_left + extend_right + width;
- assert(linesize <= src_stride);
-
- for (i = 0; i < extend_top; i++) {
- memcpy(top_dst, top_src, linesize);
- top_dst += src_stride;
- }
-
- for (i = 0; i < extend_bottom; i++) {
- memcpy(bot_dst, bot_src, linesize);
- bot_dst += src_stride;
- }
-}
-
-static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
- const int c_w = ybf->uv_crop_width;
- const int c_h = ybf->uv_crop_height;
- const int ss_x = ybf->subsampling_x;
- const int ss_y = ybf->subsampling_y;
- const int c_et = ext_size >> ss_y;
- const int c_el = ext_size >> ss_x;
- const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height;
- const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width;
-
- assert(ybf->y_height - ybf->y_crop_height < 16);
- assert(ybf->y_width - ybf->y_crop_width < 16);
- assert(ybf->y_height - ybf->y_crop_height >= 0);
- assert(ybf->y_width - ybf->y_crop_width >= 0);
-
- extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
- ybf->y_crop_height, ext_size, ext_size,
- ext_size + ybf->y_height - ybf->y_crop_height,
- ext_size + ybf->y_width - ybf->y_crop_width);
-
- extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
-
- extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
-}
-
-void aom_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
- const int num_planes) {
- extend_frame(ybf, ybf->border, num_planes);
-}
-
-void aom_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
- const int num_planes) {
- const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS)
- ? AOMINNERBORDERINPIXELS
- : ybf->border;
- extend_frame(ybf, inner_bw, num_planes);
-}
-#endif
diff --git a/aom_util/endian_inl.h b/aom_util/endian_inl.h
index f536ec5b8..b69102a7f 100644
--- a/aom_util/endian_inl.h
+++ b/aom_util/endian_inl.h
@@ -64,11 +64,6 @@
#define HAVE_BUILTIN_BSWAP64
#endif
-#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
- defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
-#define AOM_USE_MIPS32_R2
-#endif
-
static INLINE uint16_t BSwap16(uint16_t x) {
#if defined(HAVE_BUILTIN_BSWAP16)
return __builtin_bswap16(x);
@@ -81,15 +76,7 @@ static INLINE uint16_t BSwap16(uint16_t x) {
}
static INLINE uint32_t BSwap32(uint32_t x) {
-#if defined(AOM_USE_MIPS32_R2)
- uint32_t ret;
- __asm__ volatile(
- "wsbh %[ret], %[x] \n\t"
- "rotr %[ret], %[ret], 16 \n\t"
- : [ret] "=r"(ret)
- : [x] "r"(x));
- return ret;
-#elif defined(HAVE_BUILTIN_BSWAP32)
+#if defined(HAVE_BUILTIN_BSWAP32)
return __builtin_bswap32(x);
#elif defined(__i386__) || defined(__x86_64__)
uint32_t swapped_bytes;
diff --git a/apps/aomdec.c b/apps/aomdec.c
index 2c74dd36c..ab4a37fa5 100644
--- a/apps/aomdec.c
+++ b/apps/aomdec.c
@@ -181,13 +181,15 @@ void usage_exit(void) {
exit(EXIT_FAILURE);
}
-static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
- size_t *buffer_size) {
- char raw_hdr[RAW_FRAME_HDR_SZ];
+static int raw_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size) {
+ unsigned char raw_hdr[RAW_FRAME_HDR_SZ];
size_t frame_size = 0;
- if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
- if (!feof(infile)) aom_tools_warn("Failed to read RAW frame size\n");
+ if (read_from_input(input_ctx, RAW_FRAME_HDR_SZ, raw_hdr) !=
+ RAW_FRAME_HDR_SZ) {
+ if (!input_eof(input_ctx))
+ aom_tools_warn("Failed to read RAW frame size\n");
} else {
const size_t kCorruptFrameThreshold = 256 * 1024 * 1024;
const size_t kFrameTooSmallThreshold = 256 * 1024;
@@ -217,8 +219,8 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
}
}
- if (!feof(infile)) {
- if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+ if (!input_eof(input_ctx)) {
+ if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) {
aom_tools_warn("Failed to read full frame\n");
return 1;
}
@@ -237,10 +239,10 @@ static int read_frame(struct AvxDecInputContext *input, uint8_t **buf,
buffer_size);
#endif
case FILE_TYPE_RAW:
- return raw_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer,
+ return raw_read_frame(input->aom_input_ctx, buf, bytes_in_buffer,
buffer_size);
case FILE_TYPE_IVF:
- return ivf_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer,
+ return ivf_read_frame(input->aom_input_ctx, buf, bytes_in_buffer,
buffer_size, NULL);
case FILE_TYPE_OBU:
return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer,
@@ -255,7 +257,7 @@ static int file_is_raw(struct AvxInputContext *input) {
aom_codec_stream_info_t si;
memset(&si, 0, sizeof(si));
- if (fread(buf, 1, 32, input->file) == 32) {
+ if (buffer_input(input, 32, buf, /*buffered=*/true) == 32) {
int i;
if (mem_get_le32(buf) < 256 * 1024 * 1024) {
@@ -274,7 +276,7 @@ static int file_is_raw(struct AvxInputContext *input) {
}
}
- rewind(input->file);
+ rewind_detect(input);
return is_raw;
}
@@ -599,11 +601,13 @@ static int main_loop(int argc, const char **argv_) {
fprintf(stderr, "No input file specified!\n");
usage_exit();
}
+
+ const bool using_file = strcmp(fn, "-") != 0;
/* Open file */
- infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
+ infile = using_file ? fopen(fn, "rb") : set_binary_mode(stdin);
if (!infile) {
- fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin");
+ fatal("Failed to open input file '%s'", using_file ? fn : "stdin");
}
#if CONFIG_OS_SUPPORT
/* Make sure we don't dump to the terminal, unless forced to with -o - */
@@ -617,21 +621,30 @@ static int main_loop(int argc, const char **argv_) {
#endif
input.aom_input_ctx->filename = fn;
input.aom_input_ctx->file = infile;
- if (file_is_ivf(input.aom_input_ctx)) {
- input.aom_input_ctx->file_type = FILE_TYPE_IVF;
- is_ivf = 1;
- }
+
+ // TODO(https://crbug.com/aomedia/1706): webm type does not support reading
+ // from stdin yet, and file_is_webm is not using the detect buffer when
+ // determining the type. Therefore it should only be checked when using a file
+ // and needs to be checked prior to other types.
+ if (false) {
#if CONFIG_WEBM_IO
- else if (file_is_webm(input.webm_ctx, input.aom_input_ctx))
+ } else if (using_file && file_is_webm(input.webm_ctx, input.aom_input_ctx)) {
input.aom_input_ctx->file_type = FILE_TYPE_WEBM;
#endif
- else if (file_is_obu(&obu_ctx))
+ } else if (file_is_ivf(input.aom_input_ctx)) {
+ input.aom_input_ctx->file_type = FILE_TYPE_IVF;
+ is_ivf = 1;
+ } else if (file_is_obu(&obu_ctx)) {
input.aom_input_ctx->file_type = FILE_TYPE_OBU;
- else if (file_is_raw(input.aom_input_ctx))
+ } else if (file_is_raw(input.aom_input_ctx)) {
input.aom_input_ctx->file_type = FILE_TYPE_RAW;
- else {
+ } else {
fprintf(stderr, "Unrecognized input file type.\n");
-#if !CONFIG_WEBM_IO
+#if CONFIG_WEBM_IO
+ if (!using_file) {
+ fprintf(stderr, "aomdec does not support piped WebM input.\n");
+ }
+#else
fprintf(stderr, "aomdec was built without WebM container support.\n");
#endif
free(argv);
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 246bb69c8..ef208fd3e 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -450,7 +450,9 @@ const arg_def_t *av1_key_val_args[] = {
&g_av1_codec_arg_defs.second_pass_log,
&g_av1_codec_arg_defs.fwd_kf_dist,
&g_av1_codec_arg_defs.strict_level_conformance,
+ &g_av1_codec_arg_defs.sb_qp_sweep,
&g_av1_codec_arg_defs.dist_metric,
+ &g_av1_codec_arg_defs.kf_max_pyr_height,
NULL,
};
@@ -2583,11 +2585,18 @@ int main(int argc, const char **argv_) {
for (int i = 0; i < num_operating_points; i++) {
if (levels[i] > target_levels[i]) {
- aom_tools_warn(
- "Failed to encode to target level %d.%d for operating point "
- "%d. The output level is %d.%d",
- 2 + (target_levels[i] >> 2), target_levels[i] & 3, i,
- 2 + (levels[i] >> 2), levels[i] & 3);
+ if (levels[i] == 31) {
+ aom_tools_warn(
+ "Failed to encode to target level %d.%d for operating point "
+ "%d. The output level is SEQ_LEVEL_MAX",
+ 2 + (target_levels[i] >> 2), target_levels[i] & 3, i);
+ } else {
+ aom_tools_warn(
+ "Failed to encode to target level %d.%d for operating point "
+ "%d. The output level is %d.%d",
+ 2 + (target_levels[i] >> 2), target_levels[i] & 3, i,
+ 2 + (levels[i] >> 2), levels[i] & 3);
+ }
}
}
}
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index 58e6f4e23..abfd4b371 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -427,8 +427,9 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = {
"Enable diagonal (D45 to D203) intra prediction modes, which are "
"a subset of directional modes; has no effect if "
"enable-directional-intra is 0 (0: false, 1: true (default))"),
- .force_video_mode = ARG_DEF(NULL, "force-video-mode", 1,
- "Force video mode (0: false, 1: true (default))"),
+ .force_video_mode = ARG_DEF(
+ NULL, "force-video-mode", 1,
+ "Force video mode even for a single frame (0: false (default), 1: true)"),
.enable_obmc = ARG_DEF(NULL, "enable-obmc", 1,
"Enable OBMC (0: false, 1: true (default))"),
.enable_overlay =
@@ -672,5 +673,15 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = {
ARG_DEF(NULL, "strict-level-conformance", 1,
"When set to 1, exit the encoder when it fails to encode "
"to a given target level"),
+ .kf_max_pyr_height = ARG_DEF(
+ NULL, "kf-max-pyr-height", 1,
+ "Maximum height of pyramid structure used for the GOP starting with a "
+ "key frame (-1 to 5). When set to -1 (default), it does not have any "
+ "effect. The actual maximum pyramid height will be the minimum of this "
+ "value and the value of gf_max_pyr_height."),
+ .sb_qp_sweep =
+ ARG_DEF(NULL, "sb-qp-sweep", 1,
+ "When set to 1, enable the superblock level qp sweep for a "
+ "given lambda to minimize the rdcost."),
#endif // CONFIG_AV1_ENCODER
};
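
With these definitions the two options become available as key/value arguments on the aomenc command line when the encoder is built with them, e.g. (input/output file names hypothetical):

  aomenc --kf-max-pyr-height=3 --sb-qp-sweep=1 -o out.ivf input.y4m
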
diff --git a/av1/arg_defs.h b/av1/arg_defs.h
index 5c30f0d9b..e15a84c7b 100644
--- a/av1/arg_defs.h
+++ b/av1/arg_defs.h
@@ -231,6 +231,8 @@ typedef struct av1_codec_arg_definitions {
arg_def_t second_pass_log;
arg_def_t auto_intra_tools_off;
arg_def_t strict_level_conformance;
+ arg_def_t kf_max_pyr_height;
+ arg_def_t sb_qp_sweep;
#endif // CONFIG_AV1_ENCODER
} av1_codec_arg_definitions_t;
diff --git a/av1/av1.cmake b/av1/av1.cmake
index c017427a2..ae1eba7d0 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -39,6 +39,7 @@ list(APPEND AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/cfl.c"
"${AOM_ROOT}/av1/common/cfl.h"
"${AOM_ROOT}/av1/common/common.h"
+ "${AOM_ROOT}/av1/common/common_data.c"
"${AOM_ROOT}/av1/common/common_data.h"
"${AOM_ROOT}/av1/common/convolve.c"
"${AOM_ROOT}/av1/common/convolve.h"
@@ -66,6 +67,7 @@ list(APPEND AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/quant_common.h"
"${AOM_ROOT}/av1/common/reconinter.c"
"${AOM_ROOT}/av1/common/reconinter.h"
+ "${AOM_ROOT}/av1/common/reconinter_template.inc"
"${AOM_ROOT}/av1/common/reconintra.c"
"${AOM_ROOT}/av1/common/reconintra.h"
"${AOM_ROOT}/av1/common/resize.c"
@@ -135,10 +137,6 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/compound_type.h"
"${AOM_ROOT}/av1/encoder/context_tree.c"
"${AOM_ROOT}/av1/encoder/context_tree.h"
- "${AOM_ROOT}/av1/encoder/corner_detect.c"
- "${AOM_ROOT}/av1/encoder/corner_detect.h"
- "${AOM_ROOT}/av1/encoder/corner_match.c"
- "${AOM_ROOT}/av1/encoder/corner_match.h"
"${AOM_ROOT}/av1/encoder/cost.c"
"${AOM_ROOT}/av1/encoder/cost.h"
"${AOM_ROOT}/av1/encoder/encodeframe.c"
@@ -209,8 +207,6 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/picklpf.h"
"${AOM_ROOT}/av1/encoder/pickrst.c"
"${AOM_ROOT}/av1/encoder/pickrst.h"
- "${AOM_ROOT}/av1/encoder/ransac.c"
- "${AOM_ROOT}/av1/encoder/ransac.h"
"${AOM_ROOT}/av1/encoder/ratectrl.c"
"${AOM_ROOT}/av1/encoder/ratectrl.h"
"${AOM_ROOT}/av1/encoder/rc_utils.h"
@@ -333,7 +329,6 @@ list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c"
- "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
@@ -341,7 +336,6 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
- "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
@@ -364,12 +358,12 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c")
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c")
-list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
- "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
- "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
- "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
+list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32
+ "${AOM_ROOT}/av1/encoder/arm/crc32/hash_crc32.c")
list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
@@ -631,6 +625,16 @@ function(setup_av1_targets)
"AOM_AV1_ENCODER_INTRIN_NEON")
endif()
endif()
+
+ if(HAVE_ARM_CRC32)
+ if(CONFIG_AV1_ENCODER)
+ if(AOM_AV1_ENCODER_INTRIN_ARM_CRC32)
+ add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "crc32"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_ARM_CRC32")
+ endif()
+ endif()
+ endif()
endif()
if(HAVE_VSX)
@@ -640,11 +644,6 @@ function(setup_av1_targets)
endif()
endif()
- if(HAVE_MSA)
- add_intrinsics_object_library("" "msa" "aom_av1_encoder"
- "AOM_AV1_ENCODER_INTRIN_MSA")
- endif()
-
# Pass the new lib targets up to the parent scope instance of
# $AOM_LIB_TARGETS.
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 872f56864..d1f027f97 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -172,6 +172,9 @@ struct av1_extracfg {
int fwd_kf_dist;
LOOPFILTER_CONTROL loopfilter_control;
+ // Indicates if the application of post-processing filters should be skipped
+ // on reconstructed frame.
+ unsigned int skip_postproc_filtering;
// the name of the second pass output file when passes > 2
const char *two_pass_output;
const char *second_pass_log;
@@ -188,6 +191,8 @@ struct av1_extracfg {
// "--enable_diagonal_intra".
int auto_intra_tools_off;
int strict_level_conformance;
+ int kf_max_pyr_height;
+ int sb_qp_sweep;
};
#if CONFIG_REALTIME_ONLY
@@ -344,10 +349,13 @@ static const struct av1_extracfg default_extra_cfg = {
-1, // passes
-1, // fwd_kf_dist
LOOPFILTER_ALL, // loopfilter_control
+ 0, // skip_postproc_filtering
NULL, // two_pass_output
NULL, // second_pass_log
0, // auto_intra_tools_off
0, // strict_level_conformance
+ -1, // kf_max_pyr_height
+ 0, // sb_qp_sweep
};
#else
static const struct av1_extracfg default_extra_cfg = {
@@ -490,10 +498,13 @@ static const struct av1_extracfg default_extra_cfg = {
-1, // passes
-1, // fwd_kf_dist
LOOPFILTER_ALL, // loopfilter_control
+ 0, // skip_postproc_filtering
NULL, // two_pass_output
NULL, // second_pass_log
0, // auto_intra_tools_off
0, // strict_level_conformance
+ -1, // kf_max_pyr_height
+ 0, // sb_qp_sweep
};
#endif
@@ -535,7 +546,7 @@ static INLINE int gcd(int64_t a, int b) {
return (int)a;
}
-void av1_reduce_ratio(aom_rational64_t *ratio) {
+static void reduce_ratio(aom_rational64_t *ratio) {
const int denom = gcd(ratio->num, ratio->den);
ratio->num /= denom;
ratio->den /= denom;
@@ -612,8 +623,16 @@ static aom_codec_err_t allocate_and_set_string(const char *src,
static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
const aom_codec_enc_cfg_t *cfg,
const struct av1_extracfg *extra_cfg) {
- RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available
- RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available
+ RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available
+ RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available
+ RANGE_CHECK_HI(cfg, g_forced_max_frame_width, 65536); // 16 bits available
+ RANGE_CHECK_HI(cfg, g_forced_max_frame_height, 65536); // 16 bits available
+ if (cfg->g_forced_max_frame_width) {
+ RANGE_CHECK_HI(cfg, g_w, cfg->g_forced_max_frame_width);
+ }
+ if (cfg->g_forced_max_frame_height) {
+ RANGE_CHECK_HI(cfg, g_h, cfg->g_forced_max_frame_height);
+ }
RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
@@ -829,16 +848,27 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
const int level_idx = extra_cfg->target_seq_level_idx[i];
- if (!is_valid_seq_level_idx(level_idx) && level_idx != SEQ_LEVELS) {
+ if (!is_valid_seq_level_idx(level_idx) &&
+ level_idx != SEQ_LEVEL_KEEP_STATS) {
ERROR("Target sequence level index is invalid");
}
}
RANGE_CHECK(extra_cfg, deltaq_strength, 0, 1000);
RANGE_CHECK_HI(extra_cfg, loopfilter_control, 3);
+ RANGE_CHECK_BOOL(extra_cfg, skip_postproc_filtering);
RANGE_CHECK_HI(extra_cfg, enable_cdef, 2);
RANGE_CHECK_BOOL(extra_cfg, auto_intra_tools_off);
RANGE_CHECK_BOOL(extra_cfg, strict_level_conformance);
+ RANGE_CHECK_BOOL(extra_cfg, sb_qp_sweep);
+
+ RANGE_CHECK(extra_cfg, kf_max_pyr_height, -1, 5);
+ if (extra_cfg->kf_max_pyr_height != -1 &&
+ extra_cfg->kf_max_pyr_height < (int)extra_cfg->gf_min_pyr_height) {
+ ERROR(
+ "The value of kf-max-pyr-height should not be smaller than "
+ "gf-min-pyr-height");
+ }
return AOM_CODEC_OK;
}
@@ -920,11 +950,10 @@ static void update_default_encoder_config(const cfg_options_t *cfg,
struct av1_extracfg *extra_cfg) {
extra_cfg->enable_cdef = (cfg->disable_cdef == 0) ? 1 : 0;
extra_cfg->enable_restoration = (cfg->disable_lr == 0);
- extra_cfg->superblock_size = (cfg->super_block_size == 64)
- ? AOM_SUPERBLOCK_SIZE_64X64
- : (cfg->super_block_size == 128)
- ? AOM_SUPERBLOCK_SIZE_128X128
- : AOM_SUPERBLOCK_SIZE_DYNAMIC;
+ extra_cfg->superblock_size =
+ (cfg->super_block_size == 64) ? AOM_SUPERBLOCK_SIZE_64X64
+ : (cfg->super_block_size == 128) ? AOM_SUPERBLOCK_SIZE_128X128
+ : AOM_SUPERBLOCK_SIZE_DYNAMIC;
extra_cfg->enable_warped_motion = (cfg->disable_warp_motion == 0);
extra_cfg->enable_dist_wtd_comp = (cfg->disable_dist_wtd_comp == 0);
extra_cfg->enable_diff_wtd_comp = (cfg->disable_diff_wtd_comp == 0);
@@ -1121,7 +1150,7 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile;
tool_cfg->superblock_size = extra_cfg->superblock_size;
tool_cfg->enable_monochrome = cfg->monochrome;
- tool_cfg->full_still_picture_hdr = cfg->full_still_picture_hdr;
+ tool_cfg->full_still_picture_hdr = cfg->full_still_picture_hdr != 0;
tool_cfg->enable_dual_filter = extra_cfg->enable_dual_filter;
tool_cfg->enable_order_hint = extra_cfg->enable_order_hint;
tool_cfg->enable_interintra_comp = extra_cfg->enable_interintra_comp;
@@ -1182,6 +1211,7 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
algo_cfg->enable_tpl_model =
resize_cfg->resize_mode ? 0 : extra_cfg->enable_tpl_model;
algo_cfg->loopfilter_control = extra_cfg->loopfilter_control;
+ algo_cfg->skip_postproc_filtering = extra_cfg->skip_postproc_filtering;
// Set two-pass stats configuration.
oxcf->twopass_stats_in = cfg->rc_twopass_stats_in;
@@ -1209,6 +1239,12 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
kf_cfg->enable_intrabc = extra_cfg->enable_intrabc;
oxcf->speed = extra_cfg->cpu_used;
+ // TODO(yunqingwang, any) In REALTIME mode, 1080p performance at speed 5 & 6
+ // is quite bad. Force to use speed 7 for now. Will investigate it when we
+ // work on rd path optimization later.
+ if (oxcf->mode == REALTIME && AOMMIN(cfg->g_w, cfg->g_h) >= 1080 &&
+ oxcf->speed < 7)
+ oxcf->speed = 7;
// Set Color related configuration.
color_cfg->color_primaries = extra_cfg->color_primaries;
@@ -1270,10 +1306,10 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
tile_cfg->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
tile_cfg->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
for (int i = 0; i < tile_cfg->tile_width_count; i++) {
- tile_cfg->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1);
+ tile_cfg->tile_widths[i] = cfg->tile_widths[i];
}
for (int i = 0; i < tile_cfg->tile_height_count; i++) {
- tile_cfg->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
+ tile_cfg->tile_heights[i] = cfg->tile_heights[i];
}
tile_cfg->enable_ext_tile_debug = extra_cfg->ext_tile_debug;
@@ -1418,6 +1454,10 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
oxcf->strict_level_conformance = extra_cfg->strict_level_conformance;
+ oxcf->kf_max_pyr_height = extra_cfg->kf_max_pyr_height;
+
+ oxcf->sb_qp_sweep = extra_cfg->sb_qp_sweep;
+
return AOM_CODEC_OK;
}
@@ -2400,6 +2440,13 @@ static aom_codec_err_t ctrl_enable_sb_multipass_unit_test(
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_enable_sb_qp_sweep(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.sb_qp_sweep = CAST(AV1E_ENABLE_SB_QP_SWEEP, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx,
va_list args) {
AV1_COMP *const cpi = ctx->ppi->cpi;
@@ -2419,6 +2466,17 @@ static aom_codec_err_t ctrl_set_loopfilter_control(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_skip_postproc_filtering(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ // Skipping the application of post-processing filters is allowed only
+ // for ALLINTRA mode.
+ if (ctx->cfg.g_usage != AOM_USAGE_ALL_INTRA) return AOM_CODEC_INCAPABLE;
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.skip_postproc_filtering =
+ CAST(AV1E_SET_SKIP_POSTPROC_FILTERING, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_rtc_external_rc(aom_codec_alg_priv_t *ctx,
va_list args) {
ctx->ppi->cpi->rc.rtc_external_ratectrl =
@@ -2547,7 +2605,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
priv->timestamp_ratio.den = priv->cfg.g_timebase.den;
priv->timestamp_ratio.num =
(int64_t)priv->cfg.g_timebase.num * TICKS_PER_SEC;
- av1_reduce_ratio(&priv->timestamp_ratio);
+ reduce_ratio(&priv->timestamp_ratio);
set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
if (priv->oxcf.rc_cfg.mode != AOM_CBR &&
@@ -2698,6 +2756,25 @@ static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
return flags;
}
+static INLINE int get_src_border_in_pixels(AV1_COMP *cpi, BLOCK_SIZE sb_size) {
+ if (cpi->oxcf.mode != REALTIME || av1_is_resize_needed(&cpi->oxcf))
+ return cpi->oxcf.border_in_pixels;
+
+ const int sb_size_in_pixels_log2 = mi_size_wide_log2[sb_size] + MI_SIZE_LOG2;
+ const int sb_aligned_width =
+ ALIGN_POWER_OF_TWO(cpi->oxcf.frm_dim_cfg.width, sb_size_in_pixels_log2);
+ const int sb_aligned_height =
+ ALIGN_POWER_OF_TWO(cpi->oxcf.frm_dim_cfg.height, sb_size_in_pixels_log2);
+ // Align the border pixels to a multiple of 32.
+ const int border_pixels_width =
+ ALIGN_POWER_OF_TWO(sb_aligned_width - cpi->oxcf.frm_dim_cfg.width, 5);
+ const int border_pixels_height =
+ ALIGN_POWER_OF_TWO(sb_aligned_height - cpi->oxcf.frm_dim_cfg.height, 5);
+ const int border_in_pixels =
+ AOMMAX(AOMMAX(border_pixels_width, border_pixels_height), 32);
+ return border_in_pixels;
+}
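
The border arithmetic above is easier to see with concrete numbers. A standalone re-statement of the same computation, with ALIGN_POWER_OF_TWO re-expressed as a helper and the superblock size given as a log2 width:

#include <stdio.h>

/* Round value up to the next multiple of 2^n (same idea as ALIGN_POWER_OF_TWO). */
static int align_pow2(int value, int n) {
  return (value + (1 << n) - 1) & ~((1 << n) - 1);
}

/* Realtime source-border sketch: the border only needs to cover the
 * superblock-alignment overhang of the frame, rounded up to a multiple of 32
 * and never below 32. */
static int src_border_in_pixels_sketch(int width, int height, int sb_size_log2) {
  const int overhang_w = align_pow2(width, sb_size_log2) - width;
  const int overhang_h = align_pow2(height, sb_size_log2) - height;
  const int border_w = align_pow2(overhang_w, 5);
  const int border_h = align_pow2(overhang_h, 5);
  const int border = border_w > border_h ? border_w : border_h;
  return border > 32 ? border : 32;
}

int main(void) {
  /* 1080p, 64x64 superblocks: 1080 -> 1088, overhang 8 -> border 32. */
  printf("%d\n", src_border_in_pixels_sketch(1920, 1080, 6));
  /* 1080p, 128x128 superblocks: 1080 -> 1152, overhang 72 -> border 96. */
  printf("%d\n", src_border_in_pixels_sketch(1920, 1080, 7));
  return 0;
}
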
+
// TODO(Mufaddal): Check feasibility of abstracting functions related to LAP
// into a separate function.
static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
@@ -2724,9 +2801,10 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
if (img != NULL) {
res = validate_img(ctx, img);
if (res == AOM_CODEC_OK) {
- const size_t uncompressed_frame_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
- ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) *
- av1_get_image_bps(img) / 8;
+ const size_t uncompressed_frame_sz =
+ ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) *
+ ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) *
+ av1_get_image_bps(img) / 8;
// Due to the presence of no-show frames, the ctx->cx_data buffer holds
// compressed data corresponding to multiple frames. As no-show frames are
@@ -2816,6 +2894,10 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
if (res == AOM_CODEC_OK) {
AV1_COMP *cpi = ppi->cpi;
+ const int num_layers =
+ cpi->svc.number_spatial_layers * cpi->svc.number_temporal_layers;
+ if (!av1_alloc_layer_context(cpi, num_layers)) return AOM_CODEC_MEM_ERROR;
+
// Set up internal flags
if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) ppi->b_calculate_psnr = 1;
@@ -2856,10 +2938,11 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
ppi->parallel_cpi[i]->oxcf.border_in_pixels = oxcf->border_in_pixels;
}
+ const int src_border_in_pixels = get_src_border_in_pixels(cpi, sb_size);
ppi->lookahead = av1_lookahead_init(
cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames,
- cpi->oxcf.border_in_pixels, cpi->common.features.byte_alignment,
+ src_border_in_pixels, cpi->common.features.byte_alignment,
ctx->num_lap_buffers, (cpi->oxcf.kf_cfg.key_freq_max == 0),
cpi->oxcf.tool_cfg.enable_global_motion);
}
@@ -2879,7 +2962,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
// key frame flag when we actually encode this frame.
if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
src_time_stamp, src_end_time_stamp)) {
- res = update_error_state(ctx, &ppi->error);
+ res = update_error_state(ctx, cpi->common.error);
}
ctx->next_frame_flags = 0;
}
@@ -3155,6 +3238,8 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
va_list args) {
+ if (ctx->ppi->cpi->oxcf.algo_cfg.skip_postproc_filtering)
+ return AOM_CODEC_INCAPABLE;
av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
if (frame != NULL) {
@@ -3170,6 +3255,8 @@ static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
va_list args) {
+ if (ctx->ppi->cpi->oxcf.algo_cfg.skip_postproc_filtering)
+ return AOM_CODEC_INCAPABLE;
av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
if (frame != NULL) {
@@ -3286,7 +3373,7 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
if (mode) {
const int res = av1_set_internal_size(
&ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params,
- (AOM_SCALING)mode->h_scaling_mode, (AOM_SCALING)mode->v_scaling_mode);
+ mode->h_scaling_mode, mode->v_scaling_mode);
av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
} else {
@@ -3336,6 +3423,10 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) {
unsigned int sl, tl;
ctx->ppi->use_svc = 1;
+ const int num_layers =
+ ppi->number_spatial_layers * ppi->number_temporal_layers;
+ if (!av1_alloc_layer_context(cpi, num_layers)) return AOM_CODEC_MEM_ERROR;
+
for (sl = 0; sl < ppi->number_spatial_layers; ++sl) {
for (tl = 0; tl < ppi->number_temporal_layers; ++tl) {
const int layer = LAYER_IDS_TO_IDX(sl, tl, ppi->number_temporal_layers);
@@ -3370,13 +3461,13 @@ static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx,
AV1_COMP *const cpi = ctx->ppi->cpi;
aom_svc_ref_frame_config_t *const data =
va_arg(args, aom_svc_ref_frame_config_t *);
- cpi->svc.set_ref_frame_config = 1;
+ cpi->ppi->rtc_ref.set_ref_frame_config = 1;
for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- cpi->svc.reference[i] = data->reference[i];
- cpi->svc.ref_idx[i] = data->ref_idx[i];
+ cpi->ppi->rtc_ref.reference[i] = data->reference[i];
+ cpi->ppi->rtc_ref.ref_idx[i] = data->ref_idx[i];
}
for (unsigned int i = 0; i < REF_FRAMES; ++i)
- cpi->svc.refresh[i] = data->refresh[i];
+ cpi->ppi->rtc_ref.refresh[i] = data->refresh[i];
cpi->svc.use_flexible_mode = 1;
cpi->svc.ksvc_fixed_mode = 0;
return AOM_CODEC_OK;
@@ -3387,9 +3478,9 @@ static aom_codec_err_t ctrl_set_svc_ref_frame_comp_pred(
AV1_COMP *const cpi = ctx->ppi->cpi;
aom_svc_ref_frame_comp_pred_t *const data =
va_arg(args, aom_svc_ref_frame_comp_pred_t *);
- cpi->svc.ref_frame_comp[0] = data->use_comp_pred[0];
- cpi->svc.ref_frame_comp[1] = data->use_comp_pred[1];
- cpi->svc.ref_frame_comp[2] = data->use_comp_pred[2];
+ cpi->ppi->rtc_ref.ref_frame_comp[0] = data->use_comp_pred[0];
+ cpi->ppi->rtc_ref.ref_frame_comp[1] = data->use_comp_pred[1];
+ cpi->ppi->rtc_ref.ref_frame_comp[2] = data->use_comp_pred[2];
return AOM_CODEC_OK;
}
@@ -3896,6 +3987,20 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx,
&g_av1_codec_arg_defs.strict_level_conformance,
argv, err_string)) {
extra_cfg.strict_level_conformance = arg_parse_int_helper(&arg, err_string);
+ } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sb_qp_sweep, argv,
+ err_string)) {
+ extra_cfg.sb_qp_sweep = arg_parse_int_helper(&arg, err_string);
+ } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.kf_max_pyr_height,
+ argv, err_string)) {
+ extra_cfg.kf_max_pyr_height = arg_parse_int_helper(&arg, err_string);
+ } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_width, argv,
+ err_string)) {
+ ctx->cfg.tile_width_count = arg_parse_list_helper(
+ &arg, ctx->cfg.tile_widths, MAX_TILE_WIDTHS, err_string);
+ } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_height, argv,
+ err_string)) {
+ ctx->cfg.tile_height_count = arg_parse_list_helper(
+ &arg, ctx->cfg.tile_heights, MAX_TILE_HEIGHTS, err_string);
} else {
match = 0;
snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot find aom option %s",
@@ -4077,10 +4182,12 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_SET_SVC_REF_FRAME_COMP_PRED, ctrl_set_svc_ref_frame_comp_pred },
{ AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap },
{ AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
+ { AV1E_ENABLE_SB_QP_SWEEP, ctrl_enable_sb_qp_sweep },
{ AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq },
{ AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition },
{ AV1E_SET_ENABLE_TX_SIZE_SEARCH, ctrl_set_enable_tx_size_search },
{ AV1E_SET_LOOPFILTER_CONTROL, ctrl_set_loopfilter_control },
+ { AV1E_SET_SKIP_POSTPROC_FILTERING, ctrl_set_skip_postproc_filtering },
{ AV1E_SET_AUTO_INTRA_TOOLS_OFF, ctrl_set_auto_intra_tools_off },
{ AV1E_SET_RTC_EXTERNAL_RC, ctrl_set_rtc_external_rc },
diff --git a/av1/av1_cx_iface.h b/av1/av1_cx_iface.h
index f47fedd8a..05f4901af 100644
--- a/av1/av1_cx_iface.h
+++ b/av1/av1_cx_iface.h
@@ -39,10 +39,6 @@ void av1_destroy_context_and_bufferpool(AV1_COMP *cpi,
int av1_get_image_bps(const aom_image_t *img);
-void av1_reduce_ratio(aom_rational64_t *ratio);
-
-int av1_gcd(int64_t a, int b);
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index cb5c6e374..809268f16 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -115,16 +115,18 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) {
static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
if (ctx->frame_worker != NULL) {
AVxWorker *const worker = ctx->frame_worker;
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- AV1Decoder *const pbi = frame_worker_data->pbi;
aom_get_worker_interface()->end(worker);
- aom_free(pbi->common.tpl_mvs);
- pbi->common.tpl_mvs = NULL;
- av1_remove_common(&frame_worker_data->pbi->common);
- av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync);
- av1_free_cdef_sync(&pbi->cdef_sync);
- av1_free_restoration_buffers(&pbi->common);
- av1_decoder_remove(pbi);
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ if (frame_worker_data != NULL && frame_worker_data->pbi != NULL) {
+ AV1Decoder *const pbi = frame_worker_data->pbi;
+ aom_free(pbi->common.tpl_mvs);
+ pbi->common.tpl_mvs = NULL;
+ av1_remove_common(&frame_worker_data->pbi->common);
+ av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync);
+ av1_free_cdef_sync(&pbi->cdef_sync);
+ av1_free_restoration_buffers(&pbi->common);
+ av1_decoder_remove(pbi);
+ }
aom_free(frame_worker_data);
#if CONFIG_MULTITHREAD
pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
@@ -745,8 +747,8 @@ static aom_image_t *add_grain_if_needed(aom_codec_alg_priv_t *ctx,
aom_film_grain_t *grain_params) {
if (!grain_params->apply_grain) return img;
- const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
- const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+ const int w_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_w, 1);
+ const int h_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_h, 1);
BufferPool *const pool = ctx->buffer_pool;
aom_codec_frame_buffer_t *fb =
diff --git a/av1/av1_iface_common.h b/av1/av1_iface_common.h
index 57dd1b8ed..b923c3dcf 100644
--- a/av1/av1_iface_common.h
+++ b/av1/av1_iface_common.h
@@ -137,7 +137,7 @@ static AOM_INLINE aom_codec_err_t image2yuvconfig(const aom_image_t *img,
// Note(yunqing): if img is allocated the same as the frame buffer, y_stride
// is 32-byte aligned. Also, handle the cases while allocating img without a
// border or stride_align is less than 32.
- int border = (yv12->y_stride - (int)((img->w + 31) & ~31)) / 2;
+ int border = (yv12->y_stride - (int)((img->w + 31) & ~31u)) / 2;
yv12->border = (border < 0) ? 0 : border;
yv12->subsampling_x = img->x_chroma_shift;
yv12->subsampling_y = img->y_chroma_shift;
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index 5cf6c0fa7..677078d5b 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -28,8 +28,8 @@ int av1_get_MBs(int width, int height) {
const int mi_cols = aligned_width >> MI_SIZE_LOG2;
const int mi_rows = aligned_height >> MI_SIZE_LOG2;
- const int mb_cols = (mi_cols + 2) >> 2;
- const int mb_rows = (mi_rows + 2) >> 2;
+ const int mb_cols = ROUND_POWER_OF_TWO(mi_cols, 2);
+ const int mb_rows = ROUND_POWER_OF_TWO(mi_rows, 2);
return mb_rows * mb_cols;
}
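
Under its usual libaom definition, ROUND_POWER_OF_TWO(x, 2) expands to (x + 2) >> 2, so this hunk reads as a readability change rather than a behaviour change. A quick check, assuming that definition:

#include <assert.h>

/* Usual definition: add half of 2^n before shifting (round to nearest). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

int main(void) {
  for (int mi_cols = 0; mi_cols < 1024; ++mi_cols) {
    assert(ROUND_POWER_OF_TWO(mi_cols, 2) == (mi_cols + 2) >> 2);
  }
  return 0;
}
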
@@ -372,14 +372,19 @@ void av1_free_above_context_buffers(CommonContexts *above_contexts) {
for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) {
for (i = 0; i < num_planes; i++) {
+ if (above_contexts->entropy[i] == NULL) break;
aom_free(above_contexts->entropy[i][tile_row]);
above_contexts->entropy[i][tile_row] = NULL;
}
- aom_free(above_contexts->partition[tile_row]);
- above_contexts->partition[tile_row] = NULL;
+ if (above_contexts->partition != NULL) {
+ aom_free(above_contexts->partition[tile_row]);
+ above_contexts->partition[tile_row] = NULL;
+ }
- aom_free(above_contexts->txfm[tile_row]);
- above_contexts->txfm[tile_row] = NULL;
+ if (above_contexts->txfm != NULL) {
+ aom_free(above_contexts->txfm[tile_row]);
+ above_contexts->txfm[tile_row] = NULL;
+ }
}
for (i = 0; i < num_planes; i++) {
aom_free(above_contexts->entropy[i]);
@@ -397,7 +402,7 @@ void av1_free_above_context_buffers(CommonContexts *above_contexts) {
}
void av1_free_context_buffers(AV1_COMMON *cm) {
- cm->mi_params.free_mi(&cm->mi_params);
+ if (cm->mi_params.free_mi != NULL) cm->mi_params.free_mi(&cm->mi_params);
av1_free_above_context_buffers(&cm->above_contexts);
}
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
index bee496a49..1628cbf23 100644
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -250,8 +250,7 @@ static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
const int16_t c2, const int16_t c3) {
- int16x4_t val = vdup_n_s16((int16_t)0);
- val = vset_lane_s16(c0, val, 0);
+ int16x4_t val = vdup_n_s16(c0);
val = vset_lane_s16(c1, val, 1);
val = vset_lane_s16(c2, val, 2);
val = vset_lane_s16(c3, val, 3);
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 7a8fed50f..4397a47e7 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -14,15 +14,15 @@
#include "av1/common/cdef_block_simd.h"
void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
- const uint8_t *src, int sstride, int v,
- int h) {
+ const uint8_t *src, int sstride,
+ int width, int height) {
int j;
- for (int i = 0; i < v; i++) {
- for (j = 0; j < (h & ~0x7); j += 8) {
+ for (int i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
v64 row = v64_load_unaligned(&src[i * sstride + j]);
v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
}
- for (; j < h; j++) {
+ for (; j < width; j++) {
dst[i * dstride + j] = src[i * sstride + j];
}
}
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index f0e4bedcc..012b3f78c 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -27,68 +27,41 @@ static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
- const int16_t *filter) {
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
- sum = vmul_n_s16(s0, filter[0]);
- sum = vmla_n_s16(sum, s1, filter[1]);
- sum = vmla_n_s16(sum, s2, filter[2]);
- sum = vmla_n_s16(sum, s5, filter[5]);
- sum = vmla_n_s16(sum, s6, filter[6]);
- sum = vmla_n_s16(sum, s7, filter[7]);
- /* filter[3] can take a max value of 128. So the max value of the result :
- * 128*255 + sum > 16 bits
- */
- sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
- sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
return sum;
}
-static INLINE uint8x8_t convolve8_horiz_8x8(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
- const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
- int16x8_t sum;
-
- sum = vmulq_n_s16(s0, filter[0]);
- sum = vmlaq_n_s16(sum, s1, filter[1]);
- sum = vmlaq_n_s16(sum, s2, filter[2]);
- sum = vmlaq_n_s16(sum, s5, filter[5]);
- sum = vmlaq_n_s16(sum, s6, filter[6]);
- sum = vmlaq_n_s16(sum, s7, filter[7]);
- /* filter[3] can take a max value of 128. So the max value of the result :
- * 128*255 + sum > 16 bits
- */
- sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
- sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
-
- sum = vqrshlq_s16(sum, shift_round_0);
- sum = vqrshlq_s16(sum, shift_by_bits);
-
- return vqmovun_s16(sum);
-}
-
#if !defined(__aarch64__)
static INLINE uint8x8_t convolve8_horiz_4x1(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
- sum = vmul_n_s16(s0, filter[0]);
- sum = vmla_n_s16(sum, s1, filter[1]);
- sum = vmla_n_s16(sum, s2, filter[2]);
- sum = vmla_n_s16(sum, s5, filter[5]);
- sum = vmla_n_s16(sum, s6, filter[6]);
- sum = vmla_n_s16(sum, s7, filter[7]);
- /* filter[3] can take a max value of 128. So the max value of the result :
- * 128*255 + sum > 16 bits
- */
- sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
- sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
sum = vqrshl_s16(sum, shift_round_0);
sum = vqrshl_s16(sum, shift_by_bits);
@@ -100,80 +73,161 @@ static INLINE uint8x8_t convolve8_horiz_4x1(
static INLINE uint8x8_t convolve8_vert_8x4(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum;
- sum = vmulq_n_s16(s0, filter[0]);
- sum = vmlaq_n_s16(sum, s1, filter[1]);
- sum = vmlaq_n_s16(sum, s2, filter[2]);
- sum = vmlaq_n_s16(sum, s5, filter[5]);
- sum = vmlaq_n_s16(sum, s6, filter[6]);
- sum = vmlaq_n_s16(sum, s7, filter[7]);
- /* filter[3] can take a max value of 128. So the max value of the result :
- * 128*255 + sum > 16 bits
- */
- sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
- sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
-
- return vqrshrun_n_s16(sum, FILTER_BITS);
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE uint16x4_t convolve8_vert_4x4_s32(
+static INLINE int16x4_t convolve8_vert_4x4_s32(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
const int32x4_t round_shift_vec, const int32x4_t offset_const,
const int32x4_t sub_const_vec) {
- int32x4_t sum0;
- uint16x4_t res;
- const int32x4_t zero = vdupq_n_s32(0);
-
- sum0 = vmull_n_s16(s0, y_filter[0]);
- sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
- sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
- sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
- sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
- sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
- sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
- sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ int32x4_t sum;
+
+ sum = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ sum = vaddq_s32(sum, offset_const);
+ sum = vqrshlq_s32(sum, round_shift_vec);
+ sum = vsubq_s32(sum, sub_const_vec);
+
+ return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_vert_8x4_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ const int32x4_t round_shift_vec, const int32x4_t offset_const,
+ const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ int32x4_t sum0, sum1;
+ int16x8_t res;
+
+ sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
sum0 = vaddq_s32(sum0, offset_const);
+ sum1 = vaddq_s32(sum1, offset_const);
sum0 = vqrshlq_s32(sum0, round_shift_vec);
+ sum1 = vqrshlq_s32(sum1, round_shift_vec);
sum0 = vsubq_s32(sum0, sub_const_vec);
- sum0 = vmaxq_s32(sum0, zero);
+ sum1 = vsubq_s32(sum1, sub_const_vec);
- res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+ res = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ res = vqrshlq_s16(res, vec_round_bits);
- return res;
+ return vqmovun_s16(res);
}
-static INLINE uint8x8_t convolve8_vert_8x4_s32(
+static INLINE int16x4_t convolve12_vert_4x4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+ const int32x4_t round_shift_vec, const int32x4_t offset_const,
+ const int32x4_t sub_const_vec) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+ int32x4_t sum;
+
+ sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
+
+ sum = vaddq_s32(sum, offset_const);
+ sum = vqrshlq_s32(sum, round_shift_vec);
+ sum = vsubq_s32(sum, sub_const_vec);
+
+ return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve12_vert_8x4_s32(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+ const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
const int32x4_t round_shift_vec, const int32x4_t offset_const,
const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
int32x4_t sum0, sum1;
- uint16x8_t res;
- const int32x4_t zero = vdupq_n_s32(0);
-
- sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
-
- sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
+ int16x8_t res;
+
+ sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+
+ sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
sum0 = vaddq_s32(sum0, offset_const);
sum1 = vaddq_s32(sum1, offset_const);
@@ -181,14 +235,329 @@ static INLINE uint8x8_t convolve8_vert_8x4_s32(
sum1 = vqrshlq_s32(sum1, round_shift_vec);
sum0 = vsubq_s32(sum0, sub_const_vec);
sum1 = vsubq_s32(sum1, sub_const_vec);
- sum0 = vmaxq_s32(sum0, zero);
- sum1 = vmaxq_s32(sum1, zero);
- res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
- vqmovn_u32(vreinterpretq_u32_s32(sum1)));
- res = vqrshlq_u16(res, vec_round_bits);
+ res = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ res = vqrshlq_s16(res, vec_round_bits);
+
+ return vqmovun_s16(res);
+}
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ return;
+ }
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ const int8_t bits = FILTER_BITS - conv_params->round_0;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
+ const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
+
+ src -= horiz_offset;
+
+ if (w <= 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23;
+
+ do {
+ s0 = vld1q_u8(src + 0 * src_stride);
+ s1 = vld1q_u8(src + 1 * src_stride);
+ s2 = vld1q_u8(src + 2 * src_stride);
+ s3 = vld1q_u8(src + 3 * src_stride);
+
+ t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, vdupq_n_s32(0));
+ t1 = convolve8_4_usdot(s1, x_filter, permute_tbl, vdupq_n_s32(0));
+ t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, vdupq_n_s32(0));
+ t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, vdupq_n_s32(0));
+
+ t01 = vcombine_s16(vmovn_s32(t0), vmovn_s32(t1));
+ t23 = vcombine_s16(vmovn_s32(t2), vmovn_s32(t3));
+
+ t01 = vqrshlq_s16(t01, shift_round_0);
+ t23 = vqrshlq_s16(t23, shift_round_0);
+
+ t01 = vqrshlq_s16(t01, shift_by_bits);
+ t23 = vqrshlq_s16(t23, shift_by_bits);
+
+ d01 = vqmovun_s16(t01);
+ d23 = vqmovun_s16(t23);
+
+ if (w == 2) {
+ vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),
+ vreinterpret_u16_u8(d01), 0);
+ vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),
+ vreinterpret_u16_u8(d01), 2);
+ if (h != 2) {
+ vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),
+ vreinterpret_u16_u8(d23), 0);
+ vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),
+ vreinterpret_u16_u8(d23), 2);
+ }
+ } else {
+ vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
+ vreinterpret_u32_u8(d01), 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
+ vreinterpret_u32_u8(d01), 1);
+ if (h != 2) {
+ vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
+ vreinterpret_u32_u8(d23), 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
+ vreinterpret_u32_u8(d23), 1);
+ }
+ }
+
+ h -= 4;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ } while (h > 0);
+
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t t0, t1, t2, t3;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ int width = w;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+
+ do {
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ t0 = convolve8_8_usdot(s0, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+ t1 = convolve8_8_usdot(s1, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+ t2 = convolve8_8_usdot(s2, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+ t3 = convolve8_8_usdot(s3, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+
+ t0 = vqrshlq_s16(t0, shift_by_bits);
+ t1 = vqrshlq_s16(t1, shift_by_bits);
+ t2 = vqrshlq_s16(t2, shift_by_bits);
+ t3 = vqrshlq_s16(t3, shift_by_bits);
+
+ d0 = vqmovun_s16(t0);
+ d1 = vqmovun_s16(t1);
+ d2 = vqmovun_s16(t2);
+ d3 = vqmovun_s16(t3);
+
+ vst1_u8(d + 0 * dst_stride, d0);
+ vst1_u8(d + 1 * dst_stride, d1);
+ if (h != 2) {
+ vst1_u8(d + 2 * dst_stride, d2);
+ vst1_u8(d + 3 * dst_stride, d3);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ return;
+ }
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ const int8_t bits = FILTER_BITS - conv_params->round_0;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+ // Dot product constants.
+ const int16x8_t correct_tmp = vshll_n_s8(x_filter, 7);
+ const int32x4_t correction = vdupq_n_s32(vaddlvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
+ const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
+
+ src -= horiz_offset;
+
+ if (w <= 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23;
+
+ do {
+ s0 = vld1q_u8(src + 0 * src_stride);
+ s1 = vld1q_u8(src + 1 * src_stride);
+ s2 = vld1q_u8(src + 2 * src_stride);
+ s3 = vld1q_u8(src + 3 * src_stride);
+
+ t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
+
+ t01 = vcombine_s16(vmovn_s32(t0), vmovn_s32(t1));
+ t23 = vcombine_s16(vmovn_s32(t2), vmovn_s32(t3));
+
+ t01 = vqrshlq_s16(t01, shift_round_0);
+ t23 = vqrshlq_s16(t23, shift_round_0);
+
+ t01 = vqrshlq_s16(t01, shift_by_bits);
+ t23 = vqrshlq_s16(t23, shift_by_bits);
+
+ d01 = vqmovun_s16(t01);
+ d23 = vqmovun_s16(t23);
+
+ if (w == 2) {
+ vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),
+ vreinterpret_u16_u8(d01), 0);
+ vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),
+ vreinterpret_u16_u8(d01), 2);
+ if (h != 2) {
+ vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),
+ vreinterpret_u16_u8(d23), 0);
+ vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),
+ vreinterpret_u16_u8(d23), 2);
+ }
+ } else {
+ vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
+ vreinterpret_u32_u8(d01), 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
+ vreinterpret_u32_u8(d01), 1);
+ if (h != 2) {
+ vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
+ vreinterpret_u32_u8(d23), 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
+ vreinterpret_u32_u8(d23), 1);
+ }
+ }
+
+ h -= 4;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ } while (h > 0);
+
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t t0, t1, t2, t3;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ int width = w;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+
+ do {
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ t0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ t1 = convolve8_8_sdot(s1, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ t2 = convolve8_8_sdot(s2, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ t3 = convolve8_8_sdot(s3, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+
+ t0 = vqrshlq_s16(t0, shift_by_bits);
+ t1 = vqrshlq_s16(t1, shift_by_bits);
+ t2 = vqrshlq_s16(t2, shift_by_bits);
+ t3 = vqrshlq_s16(t3, shift_by_bits);
+
+ d0 = vqmovun_s16(t0);
+ d1 = vqmovun_s16(t1);
+ d2 = vqmovun_s16(t2);
+ d3 = vqmovun_s16(t3);
+
+ vst1_u8(d + 0 * dst_stride, d0);
+ vst1_u8(d + 1 * dst_stride, d1);
+ if (h != 2) {
+ vst1_u8(d + 2 * dst_stride, d2);
+ vst1_u8(d + 3 * dst_stride, d3);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
- return vqmovn_u16(res);
+static INLINE uint8x8_t convolve8_horiz_8x8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+ const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ int16x8_t sum;
+
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+
+ sum = vqrshlq_s16(sum, shift_round_0);
+ sum = vqrshlq_s16(sum, shift_by_bits);
+
+ return vqmovun_s16(sum);
}
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -213,10 +582,12 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ // Filter values are even so downshift by 1 to reduce precision requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
- const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
src -= horiz_offset;
@@ -609,6 +980,8 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
#endif
}
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_y,
@@ -622,8 +995,10 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
src -= vert_offset * src_stride;
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ // Filter values are even so downshift by 1 to reduce precision requirements.
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
if (w <= 4) {
uint8x8_t d01;
@@ -671,8 +1046,8 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
if ((w == 4) && (h != 2)) {
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
0); // 00 01 02 03
@@ -722,7 +1097,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS - 1);
if (w == 4) {
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
@@ -849,11 +1224,1211 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
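
The recurring pattern in the new Neon paths, "filter values are even, so downshift by 1", works because (as the code comments note) the subpel kernels have even coefficients: halving the taps halves the accumulated sum exactly, so shifting by FILTER_BITS - 1 instead of FILTER_BITS gives bit-identical output while keeping the intermediate sums one bit smaller. The lane-indexed vmla(q)_lane_s16 forms additionally let the whole halved kernel stay in one vector register instead of reloading scalar taps. A scalar model of the equivalence, with illustrative even taps (not a real AV1 kernel):

#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

/* Rounded right shift, matching ROUND_POWER_OF_TWO for non-negative sums. */
static int32_t round_shift(int32_t x, int n) { return (x + (1 << (n - 1))) >> n; }

/* All taps even => halving them halves the sum exactly, so shifting by one
 * bit less reproduces the original rounded result. */
static void check_even_filter_trick(const int16_t *taps, const uint8_t *src) {
  int32_t full = 0, half = 0;
  for (int k = 0; k < 8; ++k) {
    full += taps[k] * src[k];
    half += (taps[k] >> 1) * src[k];
  }
  assert(round_shift(full, FILTER_BITS) == round_shift(half, FILTER_BITS - 1));
}

int main(void) {
  const int16_t taps[8] = { 0, 2, -6, 126, 8, -2, 0, 0 }; /* illustrative, sums to 128 */
  const uint8_t src[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  check_even_filter_trick(taps, src);
  return 0;
}
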
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE int16x4_t convolve12_4_usdot(uint8x16_t samples,
+ const int8x16_t filters,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const,
+ const int32x4_t shift_round_0) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
+ sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
+ sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+
+ /* Narrow and re-pack. */
+ sum = vqrshlq_s32(sum, shift_round_0);
+
+ return vmovn_s32(sum);
+}
+
+static INLINE int16x8_t convolve12_8_usdot(uint8x16_t samples0,
+ uint8x16_t samples1,
+ const int8x16_t filters,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const,
+ const int32x4_t shift_round_0) {
+ uint8x16_t permuted_samples[4];
+ int32x4_t sum[2];
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples0, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples0, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples0, permute_tbl.val[2]);
+ /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
+ permuted_samples[3] = vqtbl1q_u8(samples1, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
+ sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
+ sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
+ /* Second 4 output values. */
+ sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0);
+ sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
+ sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
+
+ /* Narrow and re-pack. */
+ sum[0] = vqrshlq_s32(sum[0], shift_round_0);
+ sum[1] = vqrshlq_s32(sum[1], shift_round_0);
+
+ return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+}
+
+static INLINE void av1_convolve_2d_sr_horiz_12tap_neon(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11, const int round_0) {
+ const int bd = 8;
+
+ // Special case the following no-op filter as 128 won't fit into the
+ // 8-bit signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd - 1)));
+ const int16x8_t shift_round_0 = vdupq_n_s16(FILTER_BITS - round_0);
+ // Undo the horizontal offset in the calling function.
+ src_ptr += 5;
+
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 8) {
+ uint8x8_t s0 = vld1_u8(src_ptr + i * src_stride + j);
+ uint16x8_t t0 = vaddw_u8(vreinterpretq_u16_s16(horiz_const), s0);
+ int16x8_t d0 = vqrshlq_s16(vreinterpretq_s16_u16(t0), shift_round_0);
+ if (w == 2) {
+ vst1q_lane_s32((int32_t *)(dst_ptr + i * dst_stride),
+ vreinterpretq_s32_s16(d0), 0);
+ } else if (w == 4) {
+ vst1_s16(dst_ptr + i * dst_stride, vget_low_s16(d0));
+ } else {
+ vst1q_s16(dst_ptr + i * dst_stride + j, d0);
+ }
+ }
+ }
+ } else {
+ // Narrow filter values to 8-bit.
+ const int16x8x2_t x_filter_s16 = {
+ { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+ };
+ const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+ vmovn_s16(x_filter_s16.val[1]));
+
+ const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)));
+ const int32x4_t shift_round_0 = vdupq_n_s32(-round_0);
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ if (w <= 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ int16x4_t d0, d1, d2, d3;
+
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve12_4_usdot(s0, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d1 = convolve12_4_usdot(s1, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d2 = convolve12_4_usdot(s2, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d3 = convolve12_4_usdot(s3, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_s32((int32_t *)(d + 0 * dst_stride),
+ vreinterpret_s32_s16(d0), 0);
+ vst1_lane_s32((int32_t *)(d + 1 * dst_stride),
+ vreinterpret_s32_s16(d1), 0);
+ vst1_lane_s32((int32_t *)(d + 2 * dst_stride),
+ vreinterpret_s32_s16(d2), 0);
+ vst1_lane_s32((int32_t *)(d + 3 * dst_stride),
+ vreinterpret_s32_s16(d3), 0);
+ } else {
+ vst1_s16(d + 0 * dst_stride, d0);
+ vst1_s16(d + 1 * dst_stride, d1);
+ vst1_s16(d + 2 * dst_stride, d2);
+ vst1_s16(d + 3 * dst_stride, d3);
+ }
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h >= 4);
+
+ for (; h > 0; h--) {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0;
+ int16x4_t d0;
+
+ s0 = vld1q_u8(s);
+
+ d0 = convolve12_4_usdot(s0, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_s32((int32_t *)d, vreinterpret_s32_s16(d0), 0);
+ } else {
+ vst1_s16(d, d0);
+ }
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ int16x8_t d0, d1, d2, d3;
+
+ s0[0] = vld1q_u8(s + 0 * src_stride);
+ s1[0] = vld1q_u8(s + 1 * src_stride);
+ s2[0] = vld1q_u8(s + 2 * src_stride);
+ s3[0] = vld1q_u8(s + 3 * src_stride);
+ s0[1] = vld1q_u8(s + 0 * src_stride + 4);
+ s1[1] = vld1q_u8(s + 1 * src_stride + 4);
+ s2[1] = vld1q_u8(s + 2 * src_stride + 4);
+ s3[1] = vld1q_u8(s + 3 * src_stride + 4);
+
+ d0 = convolve12_8_usdot(s0[0], s0[1], x_filter, permute_tbl,
+ horiz_const, shift_round_0);
+ d1 = convolve12_8_usdot(s1[0], s1[1], x_filter, permute_tbl,
+ horiz_const, shift_round_0);
+ d2 = convolve12_8_usdot(s2[0], s2[1], x_filter, permute_tbl,
+ horiz_const, shift_round_0);
+ d3 = convolve12_8_usdot(s3[0], s3[1], x_filter, permute_tbl,
+ horiz_const, shift_round_0);
+
+ vst1q_s16(d + 0 * dst_stride, d0);
+ vst1q_s16(d + 1 * dst_stride, d1);
+ vst1q_s16(d + 2 * dst_stride, d2);
+ vst1q_s16(d + 3 * dst_stride, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h >= 4);
+
+ for (; h > 0; h--) {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2];
+ int16x8_t d0;
+
+ s0[0] = vld1q_u8(s);
+ s0[1] = vld1q_u8(s + 4);
+
+ d0 = convolve12_8_usdot(s0[0], s0[1], x_filter, permute_tbl,
+ horiz_const, shift_round_0);
+
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+ }
+ }
+}
+
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE int16x4_t convolve12_4_sdot(uint8x16_t samples,
+ const int8x16_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t shift_round_0) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+ sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
+ sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+
+ /* Narrow and re-pack. */
+ sum = vqrshlq_s32(sum, shift_round_0);
+
+ return vmovn_s32(sum);
+}
+
+static INLINE int16x8_t convolve12_8_sdot(
+ uint8x16_t samples0, uint8x16_t samples1, const int8x16_t filters,
+ const int32x4_t correction, const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl, const int32x4_t shift_round_0) {
+ int8x16_t clamped_samples[2], permuted_samples[4];
+ int32x4_t sum[2];
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples0, range_limit));
+ clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples1, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
+ /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
+ permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
+ /* Second 4 output values. */
+ sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
+
+ /* Narrow and re-pack. */
+ sum[0] = vqrshlq_s32(sum[0], shift_round_0);
+ sum[1] = vqrshlq_s32(sum[1], shift_round_0);
+
+ return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+}
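
convolve12_4_sdot / convolve12_8_sdot above use the standard trick for feeding unsigned pixels to the signed SDOT instruction: bias every sample down by 128 so it fits in int8, then fold 128 * sum(filter taps) (plus, in the real code, the rounding offset) into the accumulator's initial "correction" value. The USDOT variants earlier in the file need no such correction because vusdot multiplies unsigned samples by signed filters directly. A scalar model of the biasing, with a hypothetical (already narrowed) 12-tap kernel:

#include <assert.h>
#include <stdint.h>

/* Compute sum(src[k] * filter[k]) the way the signed dot-product path does:
 * subtract 128 from each sample, then add back 128 * sum(filter). */
static int32_t sdot_model(const uint8_t *src, const int8_t *filter, int taps) {
  int32_t filter_sum = 0;
  for (int k = 0; k < taps; ++k) filter_sum += filter[k];
  const int32_t correction = 128 * filter_sum;

  int32_t sum = correction;
  for (int k = 0; k < taps; ++k) {
    const int8_t clamped = (int8_t)(src[k] - 128); /* now in [-128, 127] */
    sum += clamped * filter[k];
  }
  return sum; /* equals the plain unsigned convolution sum */
}

int main(void) {
  const int8_t filter[12] = { 0, 1, -3, 6, -12, 60, 17, -7, 4, -2, 0, 0 };
  const uint8_t src[12] = { 3, 8, 15, 40, 90, 140, 200, 180, 120, 60, 20, 5 };
  int32_t ref = 0;
  for (int k = 0; k < 12; ++k) ref += src[k] * filter[k];
  assert(sdot_model(src, filter, 12) == ref);
  return 0;
}
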
+
+static INLINE void av1_convolve_2d_sr_horiz_12tap_neon(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11, const int round_0) {
+ const int bd = 8;
+
+ // Special case the following no-op filter as 128 won't fit into the
+ // 8-bit signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd - 1)));
+ const int16x8_t shift_round_0 = vdupq_n_s16(FILTER_BITS - round_0);
+ // Undo the horizontal offset in the calling function.
+ src_ptr += 5;
+
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 8) {
+ uint8x8_t s0 = vld1_u8(src_ptr + i * src_stride + j);
+ uint16x8_t t0 = vaddw_u8(vreinterpretq_u16_s16(horiz_const), s0);
+ int16x8_t d0 = vqrshlq_s16(vreinterpretq_s16_u16(t0), shift_round_0);
+ if (w == 2) {
+ vst1q_lane_s32((int32_t *)(dst_ptr + i * dst_stride),
+ vreinterpretq_s32_s16(d0), 0);
+ } else if (w == 4) {
+ vst1_s16(dst_ptr + i * dst_stride, vget_low_s16(d0));
+ } else {
+ vst1q_s16(dst_ptr + i * dst_stride + j, d0);
+ }
+ }
+ }
+ } else {
+ const int32x4_t shift_round_0 = vdupq_n_s32(-round_0);
+
+ // Narrow filter values to 8-bit.
+ const int16x8x2_t x_filter_s16 = {
+ { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+ };
+ const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+ vmovn_s16(x_filter_s16.val[1]));
+
+ // Dot product constants.
+ const int32_t horiz_const = (1 << (bd + FILTER_BITS - 1));
+ const int32x4_t correct_tmp =
+ vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),
+ vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7)));
+ const int32x4_t correction =
+ vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const);
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ if (w <= 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ int16x4_t d0, d1, d2, d3;
+
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve12_4_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d1 = convolve12_4_sdot(s1, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d2 = convolve12_4_sdot(s2, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d3 = convolve12_4_sdot(s3, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_s32((int32_t *)(d + 0 * dst_stride),
+ vreinterpret_s32_s16(d0), 0);
+ vst1_lane_s32((int32_t *)(d + 1 * dst_stride),
+ vreinterpret_s32_s16(d1), 0);
+ vst1_lane_s32((int32_t *)(d + 2 * dst_stride),
+ vreinterpret_s32_s16(d2), 0);
+ vst1_lane_s32((int32_t *)(d + 3 * dst_stride),
+ vreinterpret_s32_s16(d3), 0);
+ } else {
+ vst1_s16(d + 0 * dst_stride, d0);
+ vst1_s16(d + 1 * dst_stride, d1);
+ vst1_s16(d + 2 * dst_stride, d2);
+ vst1_s16(d + 3 * dst_stride, d3);
+ }
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h >= 4);
+
+ for (; h > 0; h--) {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0;
+ int16x4_t d0;
+
+ s0 = vld1q_u8(s);
+
+ d0 = convolve12_4_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_s32((int32_t *)d, vreinterpret_s32_s16(d0), 0);
+ } else {
+ vst1_s16(d, d0);
+ }
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ int16x8_t d0, d1, d2, d3;
+
+ s0[0] = vld1q_u8(s + 0 * src_stride);
+ s1[0] = vld1q_u8(s + 1 * src_stride);
+ s2[0] = vld1q_u8(s + 2 * src_stride);
+ s3[0] = vld1q_u8(s + 3 * src_stride);
+ s0[1] = vld1q_u8(s + 0 * src_stride + 4);
+ s1[1] = vld1q_u8(s + 1 * src_stride + 4);
+ s2[1] = vld1q_u8(s + 2 * src_stride + 4);
+ s3[1] = vld1q_u8(s + 3 * src_stride + 4);
+
+ d0 = convolve12_8_sdot(s0[0], s0[1], x_filter, correction,
+ range_limit, permute_tbl, shift_round_0);
+ d1 = convolve12_8_sdot(s1[0], s1[1], x_filter, correction,
+ range_limit, permute_tbl, shift_round_0);
+ d2 = convolve12_8_sdot(s2[0], s2[1], x_filter, correction,
+ range_limit, permute_tbl, shift_round_0);
+ d3 = convolve12_8_sdot(s3[0], s3[1], x_filter, correction,
+ range_limit, permute_tbl, shift_round_0);
+
+ vst1q_s16(d + 0 * dst_stride, d0);
+ vst1q_s16(d + 1 * dst_stride, d1);
+ vst1q_s16(d + 2 * dst_stride, d2);
+ vst1q_s16(d + 3 * dst_stride, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h >= 4);
+
+ for (; h > 0; h--) {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2];
+ int16x8_t d0;
+
+ s0[0] = vld1q_u8(s);
+ s0[1] = vld1q_u8(s + 4);
+
+ d0 = convolve12_8_sdot(s0[0], s0[1], x_filter, correction,
+ range_limit, permute_tbl, shift_round_0);
+
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+ }
+ }
+}
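+
+/* Illustrative note on the no-op filter branch above, assuming
+ * FILTER_BITS == 7 so that the centre tap equals 1 << FILTER_BITS: with the
+ * kernel { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } the filtered sum reduces to
+ *   128 * src + (1 << (bd + FILTER_BITS - 1))
+ *     == (src + (1 << (bd - 1))) << FILTER_BITS,
+ * so adding (1 << (bd - 1)) and shifting by (FILTER_BITS - round_0) produces
+ * the same intermediate value as the general dot-product path.
+ */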
+
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
+static INLINE int16x4_t convolve12_horiz_4x4_s16(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
+ const int32x4_t horiz_const, const int32x4_t shift_round_0) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+ int32x4_t sum;
+
+ sum = horiz_const;
+ sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
+
+ sum = vqrshlq_s32(sum, shift_round_0);
+
+ return vmovn_s32(sum);
+}
+
+// Horizontal filtering for 12-tap convolve_2d_sr, 4 columns per iteration.
+// Processes one row at a time.
+static INLINE void horiz_filter_12tap_w4_single_row(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11, const int32x4_t horiz_const,
+ const int32x4_t shift_round_0) {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, d0;
+ uint8x16_t t0;
+ int16x8_t tt0, tt1;
+
+ t0 = vld1q_u8(s);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+
+ s0 = vget_low_s16(tt0);
+ s4 = vget_high_s16(tt0);
+ s8 = vget_low_s16(tt1);
+ s12 = vget_high_s16(tt1);
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
+ s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
+ s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
+ s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
+
+ d0 = convolve12_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, x_filter_0_7, x_filter_8_11,
+ horiz_const, shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_s32((int32_t *)d, vreinterpret_s32_s16(d0), 0);
+ } else {
+ vst1_s16(d, d0);
+ }
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+ } while (h > 0);
+}
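+
+/* Note on the single-row path above: one 16-byte load provides samples
+ * a0..a15, and the vext_s16 calls build the twelve overlapping 4-element
+ * windows s0..s11 so that output lane n is sum_{k=0..11} a[n + k] * filter[k],
+ * which only requires a0..a14 for the four outputs of each iteration.
+ */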
+
+static INLINE void av1_convolve_2d_sr_horiz_12tap_neon(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11, const int round_0) {
+ const int bd = 8;
+ const int32x4_t shift_round_0 = vdupq_n_s32(-(round_0));
+ const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)));
+
+#if defined(__aarch64__)
+ do {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x8_t t0, t1, t2, t3;
+
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ s += 11;
+
+ do {
+ int16x4_t s11, s12, s13, s14, d0, d1, d2, d3;
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve12_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, x_filter_0_7, x_filter_8_11,
+ horiz_const, shift_round_0);
+ d1 = convolve12_horiz_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, x_filter_0_7, x_filter_8_11,
+ horiz_const, shift_round_0);
+ d2 = convolve12_horiz_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, x_filter_0_7, x_filter_8_11,
+ horiz_const, shift_round_0);
+ d3 = convolve12_horiz_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, x_filter_0_7, x_filter_8_11,
+ horiz_const, shift_round_0);
+
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+
+ if (w == 2) {
+ vst1_lane_s32((int32_t *)(d + 0 * dst_stride), vreinterpret_s32_s16(d0),
+ 0);
+ vst1_lane_s32((int32_t *)(d + 1 * dst_stride), vreinterpret_s32_s16(d1),
+ 0);
+ vst1_lane_s32((int32_t *)(d + 2 * dst_stride), vreinterpret_s32_s16(d2),
+ 0);
+ vst1_lane_s32((int32_t *)(d + 3 * dst_stride), vreinterpret_s32_s16(d3),
+ 0);
+ } else {
+ vst1_s16((d + 0 * dst_stride), d0);
+ vst1_s16((d + 1 * dst_stride), d1);
+ vst1_s16((d + 2 * dst_stride), d2);
+ vst1_s16((d + 3 * dst_stride), d3);
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h >= 4);
+
+ if (h) {
+ horiz_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride,
+ w, h, x_filter_0_7, x_filter_8_11,
+ horiz_const, shift_round_0);
+ }
+#else // !defined(__aarch64__)
+ horiz_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+ h, x_filter_0_7, x_filter_8_11, horiz_const,
+ shift_round_0);
+#endif // defined(__aarch64__)
+}
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void av1_convolve_2d_sr_vert_12tap_neon(
+ int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
+ int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int16_t round_bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+ int16x4_t d0, d1, d2, d3;
+ int16x8_t dd01, dd23;
+ uint8x8_t d01, d23;
+
+ load_s16_4x8(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ src_ptr += (8 * src_stride);
+ load_s16_4x4(src_ptr, src_stride, &s8, &s9, &s10, &s11);
+ src_ptr += (3 * src_stride);
+
+ do {
+ load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);
+ src_ptr += 4 * src_stride;
+
+ d0 = convolve12_vert_4x4_s32(
+ s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
+ y_filter_8_11, round_shift_vec, offset_const, sub_const_vec);
+ d1 = convolve12_vert_4x4_s32(
+ s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
+ y_filter_8_11, round_shift_vec, offset_const, sub_const_vec);
+ d2 = convolve12_vert_4x4_s32(
+ s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
+ y_filter_8_11, round_shift_vec, offset_const, sub_const_vec);
+ d3 = convolve12_vert_4x4_s32(
+ s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
+ y_filter_8_11, round_shift_vec, offset_const, sub_const_vec);
+
+ dd01 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
+ dd23 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
+
+ d01 = vqmovun_s16(dd01);
+ d23 = vqmovun_s16(dd23);
+
+ if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst_ptr, vreinterpret_u16_u8(d01), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u16((uint16_t *)dst_ptr, vreinterpret_u16_u8(d01), 2);
+ dst_ptr += dst_stride;
+ if (h != 2) {
+ vst1_lane_u16((uint16_t *)dst_ptr, vreinterpret_u16_u8(d23), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u16((uint16_t *)dst_ptr, vreinterpret_u16_u8(d23), 2);
+ dst_ptr += dst_stride;
+ }
+ } else {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d01), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d01), 1);
+ dst_ptr += dst_stride;
+ if (h != 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d23), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d23), 1);
+ dst_ptr += dst_stride;
+ }
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ h -= 4;
+ } while (h > 0);
+
+ } else {
+ do {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+ uint8x8_t d0, d1, d2, d3;
+
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+
+ int height = h;
+
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ s += (8 * src_stride);
+ load_s16_8x4(s, src_stride, &s8, &s9, &s10, &s11);
+ s += (3 * src_stride);
+
+ do {
+ load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
+ s += 4 * src_stride;
+
+ d0 = convolve12_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
+ s10, s11, y_filter_0_7, y_filter_8_11,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d1 = convolve12_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, y_filter_0_7, y_filter_8_11,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d2 = convolve12_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, y_filter_0_7, y_filter_8_11,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d3 = convolve12_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, y_filter_0_7, y_filter_8_11,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d, d0);
+ d += dst_stride;
+ vst1_u8(d, d1);
+ d += dst_stride;
+ if (h != 2) {
+ vst1_u8(d, d2);
+ d += dst_stride;
+ vst1_u8(d, d3);
+ d += dst_stride;
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ height -= 4;
+ } while (height > 0);
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
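+
+/* Illustrative derivation of 'sub_const' used above (the 8-tap and 6-tap
+ * vertical paths below use the same constant): the horizontal stage leaves an
+ * offset of 1 << (bd + FILTER_BITS - 1 - round_0) in every intermediate value,
+ * and the vertical stage adds a further 1 << offset_bits. Assuming the filter
+ * taps sum to 1 << FILTER_BITS, after the vertical taps and the shift by
+ * round_1 these offsets become
+ *   (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1)),
+ * which is exactly the value sub_const removes before the final down-shift.
+ */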
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE void av1_convolve_2d_sr_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+ int im_h, const int16x8_t x_filter_s16, const int round_0) {
+ const int bd = 8;
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+
+ int height = im_h;
+
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (bd + FILTER_BITS - 2));
+
+ assert(round_0 > 0);
+
+ if (w <= 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t t0, t1, t2, t3;
+ int16x4_t d0, d1, d2, d3;
+
+ do {
+ assert(height >= 4);
+
+ load_u8_8x16(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
+ t1 = convolve8_4_usdot(s1, x_filter, permute_tbl, horiz_const);
+ t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
+ t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
+
+ d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+ d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
+ d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
+ d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_u32((uint32_t *)(dst_ptr + 0 * dst_stride),
+ vreinterpret_u32_s16(d0), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 1 * dst_stride),
+ vreinterpret_u32_s16(d1), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 2 * dst_stride),
+ vreinterpret_u32_s16(d2), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 3 * dst_stride),
+ vreinterpret_u32_s16(d3), 0);
+ } else {
+ vst1_s16(dst_ptr + 0 * dst_stride, d0);
+ vst1_s16(dst_ptr + 1 * dst_stride, d1);
+ vst1_s16(dst_ptr + 2 * dst_stride, d2);
+ vst1_s16(dst_ptr + 3 * dst_stride, d3);
+ }
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height >= 4);
+
+ if (height) {
+ assert(height < 4);
+
+ do {
+ s0 = vld1q_u8(src_ptr);
+ t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
+ d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ } else {
+ vst1_s16(dst_ptr, d0);
+ }
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+ }
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t d0, d1, d2, d3;
+
+ do {
+ assert(height >= 4);
+
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve8_8_usdot(s0, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d1 = convolve8_8_usdot(s1, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d2 = convolve8_8_usdot(s2, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d3 = convolve8_8_usdot(s3, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+
+ vst1q_s16(d + 0 * dst_stride, d0);
+ vst1q_s16(d + 1 * dst_stride, d1);
+ vst1q_s16(d + 2 * dst_stride, d2);
+ vst1q_s16(d + 3 * dst_stride, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height >= 4);
+
+ if (height) {
+ assert(height < 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ s0 = vld1q_u8(s);
+ d0 = convolve8_8_usdot(s0, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+ }
+ }
+}
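+
+/* Illustrative note on the halved filter above, assuming the 8-tap x-filter
+ * coefficients are all even and sum to 1 << FILTER_BITS: halving the taps
+ * halves every product, so shifting right by (round_0 - 1) instead of round_0,
+ * together with the pre-halved rounding offset 1 << (bd + FILTER_BITS - 2),
+ * reproduces the reference
+ *   ROUND_POWER_OF_TWO(sum(src * f) + (1 << (bd + FILTER_BITS - 1)), round_0).
+ */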
+
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void av1_convolve_2d_sr_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+ int im_h, const int16x8_t x_filter_s16, const int round_0) {
+ const int bd = 8;
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+
+ int height = im_h;
+
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+ const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+ // Dot product constants.
+ const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
+ const int32x4_t correction =
+ vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+
+ assert(round_0 > 0);
+
+ if (w <= 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t t0, t1, t2, t3;
+ int16x4_t d0, d1, d2, d3;
+
+ do {
+ assert(height >= 4);
+
+ load_u8_8x16(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
+
+ d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+ d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
+ d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
+ d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_u32((uint32_t *)(dst_ptr + 0 * dst_stride),
+ vreinterpret_u32_s16(d0), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 1 * dst_stride),
+ vreinterpret_u32_s16(d1), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 2 * dst_stride),
+ vreinterpret_u32_s16(d2), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 3 * dst_stride),
+ vreinterpret_u32_s16(d3), 0);
+ } else {
+ vst1_s16(dst_ptr + 0 * dst_stride, d0);
+ vst1_s16(dst_ptr + 1 * dst_stride, d1);
+ vst1_s16(dst_ptr + 2 * dst_stride, d2);
+ vst1_s16(dst_ptr + 3 * dst_stride, d3);
+ }
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height >= 4);
+
+ if (height) {
+ assert(height < 4);
+
+ do {
+ s0 = vld1q_u8(src_ptr);
+ t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+
+ if (w == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ } else {
+ vst1_s16(dst_ptr, d0);
+ }
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+ }
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t d0, d1, d2, d3;
+
+ do {
+ assert(height >= 4);
+
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d1 = convolve8_8_sdot(s1, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d2 = convolve8_8_sdot(s2, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d3 = convolve8_8_sdot(s3, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+
+ vst1q_s16(d + 0 * dst_stride, d0);
+ vst1q_s16(d + 1 * dst_stride, d1);
+ vst1q_s16(d + 2 * dst_stride, d2);
+ vst1q_s16(d + 3 * dst_stride, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height >= 4);
+
+ if (height) {
+ assert(height < 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ s0 = vld1q_u8(s);
+ d0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+ }
+ }
+}
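+
+/* Note on the correction constant above: vshlq_n_s16(x_filter_s16, 6) scales
+ * the full-precision taps by 64, so vaddlvq_s16() yields
+ * 64 * sum(x_filter_s16) == 128 * sum(x_filter), i.e. the clamp bias for the
+ * halved 8-bit kernel actually used in the dot product, plus the pre-halved
+ * rounding offset 1 << (bd + FILTER_BITS - 2). This assumes the taps are even
+ * so that halving them is exact.
+ */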
+
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
// Horizontal filtering for convolve_2d_sr for width multiple of 8
// Processes one row at a time
static INLINE void horiz_filter_w8_single_row(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int width, int height, const int16_t *x_filter,
+ const int dst_stride, int width, int height, const int16x8_t x_filter,
const int16x8_t horiz_const, const int16x8_t shift_round_0) {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
@@ -899,7 +2474,7 @@ static INLINE void horiz_filter_w8_single_row(
// Processes one row at a time
static INLINE void horiz_filter_w4_single_row(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int width, int height, const int16_t *x_filter,
+ const int dst_stride, int width, int height, const int16x8_t x_filter,
const int16x4_t horiz_const, const int16x4_t shift_round_0) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
@@ -928,87 +2503,46 @@ static INLINE void horiz_filter_w4_single_row(
int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- if (width == 4) {
- vst1_s16(dst_ptr, d0);
- dst_ptr += dst_stride;
- } else if (width == 2) {
+ if (width == 2) {
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
- dst_ptr += dst_stride;
+ } else {
+ vst1_s16(dst_ptr, d0);
}
+ dst_ptr += dst_stride;
src_ptr += src_stride;
height--;
} while (height > 0);
}
-void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int subpel_y_qn,
- ConvolveParams *conv_params) {
- if (filter_params_x->taps > 8) {
- av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, subpel_x_qn,
- subpel_y_qn, conv_params);
- return;
- }
- int im_dst_stride;
- int width, height;
-#if defined(__aarch64__)
- uint8x8_t t0;
- uint8x8_t t1, t2, t3, t4, t5, t6, t7;
- const uint8_t *s;
-#endif
-
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
-
+static INLINE void av1_convolve_2d_sr_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+ int im_h, const int16x8_t x_filter_s16, const int round_0) {
const int bd = 8;
- const int im_h = h + filter_params_y->taps - 1;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
-
- const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-
- int16_t *dst_ptr;
- dst_ptr = im_block;
- im_dst_stride = im_stride;
- height = im_h;
- width = w;
-
- const int16_t round_bits =
- FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
- const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
- int16_t x_filter_tmp[8];
- int16x8_t filter_x_coef = vld1q_s16(x_filter);
+ int height = im_h;
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
- vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t x_filter = vshrq_n_s16(x_filter_s16, 1);
- assert(conv_params->round_0 > 0);
+ assert(round_0 > 0);
if (w <= 4) {
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
- const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
+ const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
#if defined(__aarch64__)
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
do {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ uint8x8_t t0, t1, t2, t3;
+ const uint8_t *s = src_ptr;
+
assert(height >= 4);
- s = src_ptr;
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
@@ -1021,10 +2555,6 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
s += 7;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
@@ -1035,68 +2565,65 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const, shift_round_0);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const, shift_round_0);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const, shift_round_0);
transpose_s16_4x4d(&d0, &d1, &d2, &d3);
- if (w == 4) {
- vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
- vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
- vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
- vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
- } else if (w == 2) {
- vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+
+ if (w == 2) {
+ vst1_lane_u32((uint32_t *)(dst_ptr + 0 * dst_stride),
vreinterpret_u32_s16(d0), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+ vst1_lane_u32((uint32_t *)(dst_ptr + 1 * dst_stride),
vreinterpret_u32_s16(d1), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+ vst1_lane_u32((uint32_t *)(dst_ptr + 2 * dst_stride),
vreinterpret_u32_s16(d2), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+ vst1_lane_u32((uint32_t *)(dst_ptr + 3 * dst_stride),
vreinterpret_u32_s16(d3), 0);
+ } else {
+ vst1_s16((dst_ptr + 0 * dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * dst_stride), d3);
}
+
src_ptr += 4 * src_stride;
- dst_ptr += 4 * im_dst_stride;
+ dst_ptr += 4 * dst_stride;
height -= 4;
} while (height >= 4);
if (height) {
assert(height < 4);
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+ height, x_filter, horiz_const, shift_round_0);
}
-#else
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
-#endif
+
+#else // !defined(__aarch64__)
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+ height, x_filter, horiz_const, shift_round_0);
+#endif // defined(__aarch64__)
} else {
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
- const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
#if defined(__aarch64__)
- int16_t *d_tmp;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
- do {
- assert(height >= 8);
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
-
- load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ for (; height >= 8; height -= 8) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ d0, d1, d2, d3, d4, d5, d6, d7;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1108,18 +2635,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- width = w;
- s = src_ptr + 7;
- d_tmp = dst_ptr;
-
- __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
+ s += 7;
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1135,28 +2651,26 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- horiz_const, shift_round_0);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
- horiz_const, shift_round_0);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
- horiz_const, shift_round_0);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
- horiz_const, shift_round_0);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
- horiz_const, shift_round_0);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter_tmp, horiz_const, shift_round_0);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter_tmp, horiz_const, shift_round_0);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter_tmp, horiz_const, shift_round_0);
-
- transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
- &res7);
-
- store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
- res6, res7);
+ d0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const, shift_round_0);
+ d1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ horiz_const, shift_round_0);
+ d2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ horiz_const, shift_round_0);
+ d3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ horiz_const, shift_round_0);
+ d4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+ horiz_const, shift_round_0);
+ d5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ horiz_const, shift_round_0);
+ d6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ horiz_const, shift_round_0);
+ d7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+ horiz_const, shift_round_0);
+
+ transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
@@ -1166,248 +2680,262 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s13;
s6 = s14;
s += 8;
- d_tmp += 8;
+ d += 8;
width -= 8;
} while (width > 0);
+
src_ptr += 8 * src_stride;
- dst_ptr += 8 * im_dst_stride;
- height -= 8;
- } while (height >= 8);
-
- if (height >= 4) {
- assert(height < 8);
- int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
- reg10, reg11, reg12, reg13, reg14;
- int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t out0, out1, out2, out3;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
+ dst_ptr += 8 * dst_stride;
+ }
+
+ for (; height >= 4; height -= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ dd0, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
+ int16x8_t d0, d1, d2, d3;
+ uint8x8_t t0, t1, t2, t3;
+
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
- s = src_ptr + 7;
- d_tmp = dst_ptr;
- width = w;
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ s += 7;
do {
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ dd0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
+ dd1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
+ dd2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
+ dd3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+ dd4 = convolve8_4x4(s4, s5, s6, s7, s8, s9, s10, s11, x_filter);
+ dd5 = convolve8_4x4(s5, s6, s7, s8, s9, s10, s11, s12, x_filter);
+ dd6 = convolve8_4x4(s6, s7, s8, s9, s10, s11, s12, s13, x_filter);
+ dd7 = convolve8_4x4(s7, s8, s9, s10, s11, s12, s13, s14, x_filter);
+
+ transpose_s16_4x8(&dd0, &dd1, &dd2, &dd3, &dd4, &dd5, &dd6, &dd7, &d0,
+ &d1, &d2, &d3);
+
+ d0 = vaddq_s16(d0, horiz_const);
+ d1 = vaddq_s16(d1, horiz_const);
+ d2 = vaddq_s16(d2, horiz_const);
+ d3 = vaddq_s16(d3, horiz_const);
+
+ d0 = vqrshlq_s16(d0, shift_round_0);
+ d1 = vqrshlq_s16(d1, shift_round_0);
+ d2 = vqrshlq_s16(d2, shift_round_0);
+ d3 = vqrshlq_s16(d3, shift_round_0);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
- d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
- x_filter_tmp);
-
- d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
- x_filter_tmp);
-
- d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
- x_filter_tmp);
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
- d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
- x_filter_tmp);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ }
- d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
- x_filter_tmp);
+ if (height) {
+ assert(height < 4);
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+ height, x_filter, horiz_const, shift_round_0);
+ }
- d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
- x_filter_tmp);
+#else // !defined(__aarch64__)
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+ height, x_filter, horiz_const, shift_round_0);
+#endif // defined(__aarch64__)
+ }
+}
- d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
- x_filter_tmp);
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
- d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
- x_filter_tmp);
+static INLINE void av1_convolve_2d_sr_vert_8tap_neon(
+ int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
+ int h, const int16x8_t y_filter, ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int16_t round_bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
- &out2, &out3);
+ const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
- out0 = vaddq_s16(out0, horiz_const);
- out0 = vqrshlq_s16(out0, shift_round_0);
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
- out1 = vaddq_s16(out1, horiz_const);
- out1 = vqrshlq_s16(out1, shift_round_0);
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t dd0;
+ uint8x8_t d01;
- out2 = vaddq_s16(out2, horiz_const);
- out2 = vqrshlq_s16(out2, shift_round_0);
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t dd1;
+ uint8x8_t d23;
+#endif // defined(__aarch64__)
- out3 = vaddq_s16(out3, horiz_const);
- out3 = vqrshlq_s16(out3, shift_round_0);
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
- store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+ load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ s += (7 * src_stride);
- reg0 = reg8;
- reg1 = reg9;
- reg2 = reg10;
- reg3 = reg11;
- reg4 = reg12;
- reg5 = reg13;
- reg6 = reg14;
- s += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * im_dst_stride;
- height -= 4;
- }
+ do {
+#if defined(__aarch64__)
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+ s += (4 * src_stride);
- if (height) {
- assert(height < 4);
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
- }
-#else
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+ d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+ d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+ d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
-#endif
- }
+ dd0 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
+ dd1 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
- // vertical
- {
- uint8_t *dst_u8_ptr, *d_u8;
- int16_t *v_src_ptr, *v_s;
+ d01 = vqmovun_s16(dd0);
+ d23 = vqmovun_s16(dd1);
- const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 1);
+ d += dst_stride;
+ if (h != 2) {
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 1);
+ d += dst_stride;
+ }
+ } else {
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d01), 0);
+ d += dst_stride;
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d01), 2);
+ d += dst_stride;
+ if (h != 2) {
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d23), 0);
+ d += dst_stride;
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d23), 2);
+ d += dst_stride;
+ }
+ }
- const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
- const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+#else // !defined(__aarch64__)
+ s7 = vld1_s16(s);
+ s += src_stride;
- src_stride = im_stride;
- v_src_ptr = im_block;
- dst_u8_ptr = dst;
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
- height = h;
- width = w;
+ dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
+ d01 = vqmovun_s16(dd0);
- if (width <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t d0;
- uint16x8_t dd0;
- uint8x8_t d01;
+ if (w == 2) {
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d01), 0);
+ d += dst_stride;
+ } else {
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
+ d += dst_stride;
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ h--;
+#endif // defined(__aarch64__)
+ } while (h > 0);
+ } else {
+ // Width is a multiple of 8 and height is a multiple of 4.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t d0;
#if defined(__aarch64__)
- int16x4_t s8, s9, s10;
- uint16x4_t d1, d2, d3;
- uint16x8_t dd1;
- uint8x8_t d23;
-#endif
-
- d_u8 = dst_u8_ptr;
- v_s = v_src_ptr;
+ int16x8_t s8, s9, s10;
+ uint8x8_t d1, d2, d3;
+#endif // defined(__aarch64__)
- __builtin_prefetch(v_s + 0 * im_stride);
- __builtin_prefetch(v_s + 1 * im_stride);
- __builtin_prefetch(v_s + 2 * im_stride);
- __builtin_prefetch(v_s + 3 * im_stride);
- __builtin_prefetch(v_s + 4 * im_stride);
- __builtin_prefetch(v_s + 5 * im_stride);
- __builtin_prefetch(v_s + 6 * im_stride);
- __builtin_prefetch(v_s + 7 * im_stride);
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
- load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- v_s += (7 * im_stride);
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ s += (7 * src_stride);
do {
#if defined(__aarch64__)
- load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
- v_s += (im_stride << 2);
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+ s += (4 * src_stride);
- __builtin_prefetch(d_u8 + 0 * dst_stride);
- __builtin_prefetch(d_u8 + 1 * dst_stride);
- __builtin_prefetch(d_u8 + 2 * dst_stride);
- __builtin_prefetch(d_u8 + 3 * dst_stride);
-
- d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
round_shift_vec, offset_const,
- sub_const_vec);
- d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ sub_const_vec, vec_round_bits);
+ d1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
round_shift_vec, offset_const,
- sub_const_vec);
- d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ sub_const_vec, vec_round_bits);
+ d2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
round_shift_vec, offset_const,
- sub_const_vec);
- d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ sub_const_vec, vec_round_bits);
+ d3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
round_shift_vec, offset_const,
- sub_const_vec);
-
- dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
- dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
-
- d01 = vqmovn_u16(dd0);
- d23 = vqmovn_u16(dd1);
+ sub_const_vec, vec_round_bits);
- if ((w == 4) && (h != 2)) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 1); // 10 11 12 13
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
- 0); // 20 21 22 23
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
- 1); // 30 31 32 33
- d_u8 += dst_stride;
- } else if ((w == 2) && (h != 2)) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 2); // 10 11
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
- 0); // 20 21
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
- 2); // 30 31
- d_u8 += dst_stride;
- } else if ((w == 4) && (h == 2)) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 1); // 10 11 12 13
- d_u8 += dst_stride;
- } else if ((w == 2) && (h == 2)) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 2); // 10 11
- d_u8 += dst_stride;
+ vst1_u8(d, d0);
+ d += dst_stride;
+ vst1_u8(d, d1);
+ d += dst_stride;
+ if (h != 2) {
+ vst1_u8(d, d2);
+ d += dst_stride;
+ vst1_u8(d, d3);
+ d += dst_stride;
}
s0 = s4;
@@ -1418,29 +2946,16 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
height -= 4;
-#else
- s7 = vld1_s16(v_s);
- v_s += im_stride;
-
- __builtin_prefetch(d_u8 + 0 * dst_stride);
+#else // !defined(__aarch64__)
+ s7 = vld1q_s16(s);
+ s += src_stride;
- d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
round_shift_vec, offset_const,
- sub_const_vec);
-
- dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
- d01 = vqmovn_u16(dd0);
-
- if (w == 4) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
+ sub_const_vec, vec_round_bits);
- } else if (w == 2) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- }
+ vst1_u8(d, d0);
+ d += dst_stride;
s0 = s1;
s1 = s2;
@@ -1449,108 +2964,325 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s4 = s5;
s5 = s6;
s6 = s7;
- height -= 1;
-#endif
+ height--;
+#endif // defined(__aarch64__)
} while (height > 0);
- } else {
- // if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint8x8_t res0;
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
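+
+/* Note on the narrow-width stores above: d01 packs two filtered rows into one
+ * uint8x8_t (row 0 in lanes 0-3, row 1 in lanes 4-7), so for w == 4 the rows
+ * are written with u32 lanes 0 and 1, while for w == 2 only the first two
+ * pixels of each row are written using u16 lanes 0 and 2.
+ */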
+
+static INLINE int16x4_t convolve6_vert_4x4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter, const int32x4_t round_shift_vec,
+ const int32x4_t offset_const, const int32x4_t sub_const_vec) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ int32x4_t sum;
+
+ sum = vmull_lane_s16(s0, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
+
+ sum = vaddq_s32(sum, offset_const);
+ sum = vqrshlq_s32(sum, round_shift_vec);
+ sum = vsubq_s32(sum, sub_const_vec);
+
+ return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve6_vert_8x4_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter, const int32x4_t round_shift_vec,
+ const int32x4_t offset_const, const int32x4_t sub_const_vec,
+ const int16x8_t vec_round_bits) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ int32x4_t sum0, sum1;
+ int16x8_t res;
+
+ sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
+
+ sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
+
+ sum0 = vaddq_s32(sum0, offset_const);
+ sum1 = vaddq_s32(sum1, offset_const);
+ sum0 = vqrshlq_s32(sum0, round_shift_vec);
+ sum1 = vqrshlq_s32(sum1, round_shift_vec);
+ sum0 = vsubq_s32(sum0, sub_const_vec);
+ sum1 = vsubq_s32(sum1, sub_const_vec);
+
+ res = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ res = vqrshlq_s16(res, vec_round_bits);
+
+ return vqmovun_s16(res);
+}
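+
+/* Note on the 6-tap helpers above: a 6-tap y-filter is assumed to be stored
+ * in an 8-wide array with zero outer taps, i.e.
+ *   { 0, f1, f2, f3, f4, f5, f6, 0 },
+ * so the multiplies start at lane 1 of y_filter and stop at lane 6; the two
+ * zero taps contribute nothing and are simply skipped.
+ */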
+
+static INLINE void av1_convolve_2d_sr_vert_6tap_neon(
+ int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
+ int h, const int16x8_t y_filter, ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int16_t round_bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, d0;
+ int16x8_t dd0;
+ uint8x8_t d01;
+
#if defined(__aarch64__)
- int16x8_t s8, s9, s10;
- uint8x8_t res1, res2, res3;
-#endif
+ int16x4_t s6, s7, s8, d1, d2, d3;
+ int16x8_t dd1;
+ uint8x8_t d23;
+#endif // defined(__aarch64__)
- do {
- __builtin_prefetch(v_src_ptr + 0 * im_stride);
- __builtin_prefetch(v_src_ptr + 1 * im_stride);
- __builtin_prefetch(v_src_ptr + 2 * im_stride);
- __builtin_prefetch(v_src_ptr + 3 * im_stride);
- __builtin_prefetch(v_src_ptr + 4 * im_stride);
- __builtin_prefetch(v_src_ptr + 5 * im_stride);
- __builtin_prefetch(v_src_ptr + 6 * im_stride);
- __builtin_prefetch(v_src_ptr + 7 * im_stride);
-
- v_s = v_src_ptr;
- load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- v_s += (7 * im_stride);
-
- d_u8 = dst_u8_ptr;
- height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
- do {
+ s0 = vld1_s16(s + 0 * src_stride);
+ s1 = vld1_s16(s + 1 * src_stride);
+ s2 = vld1_s16(s + 2 * src_stride);
+ s3 = vld1_s16(s + 3 * src_stride);
+ s4 = vld1_s16(s + 4 * src_stride);
+ s += (5 * src_stride);
+
+ do {
#if defined(__aarch64__)
- load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
- v_s += (im_stride << 2);
-
- __builtin_prefetch(d_u8 + 4 * dst_stride);
- __builtin_prefetch(d_u8 + 5 * dst_stride);
- __builtin_prefetch(d_u8 + 6 * dst_stride);
- __builtin_prefetch(d_u8 + 7 * dst_stride);
-
- res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+ s += (4 * src_stride);
- if (h != 2) {
- vst1_u8(d_u8, res0);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res1);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res2);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res3);
- d_u8 += dst_stride;
- } else {
- vst1_u8(d_u8, res0);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res1);
- d_u8 += dst_stride;
- }
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
-#else
- s7 = vld1q_s16(v_s);
- v_s += im_stride;
-
- __builtin_prefetch(d_u8 + 0 * dst_stride);
-
- res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
-
- vst1_u8(d_u8, res0);
- d_u8 += dst_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- height -= 1;
-#endif
- } while (height > 0);
- v_src_ptr += 8;
- dst_u8_ptr += 8;
- w -= 8;
- } while (w > 0);
+ d0 = convolve6_vert_4x4_s32(s0, s1, s2, s3, s4, s5, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+ d1 = convolve6_vert_4x4_s32(s1, s2, s3, s4, s5, s6, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+ d2 = convolve6_vert_4x4_s32(s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+ d3 = convolve6_vert_4x4_s32(s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+
+ dd0 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
+ dd1 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
+
+ d01 = vqmovun_s16(dd0);
+ d23 = vqmovun_s16(dd1);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 1);
+ d += dst_stride;
+ if (h != 2) {
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 1);
+ d += dst_stride;
+ }
+ } else {
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d01), 0);
+ d += dst_stride;
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d01), 2);
+ d += dst_stride;
+ if (h != 2) {
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d23), 0);
+ d += dst_stride;
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d23), 2);
+ d += dst_stride;
+ }
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ h -= 4;
+#else // !defined(__aarch64__)
+ s5 = vld1_s16(s);
+ s += src_stride;
+
+ d0 = convolve6_vert_4x4_s32(s0, s1, s2, s3, s4, s5, y_filter,
+ round_shift_vec, offset_const, sub_const_vec);
+
+ dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
+ d01 = vqmovun_s16(dd0);
+
+ if (w == 2) {
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(d01), 0);
+ d += dst_stride;
+ } else {
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
+ d += dst_stride;
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ h--;
+#endif // defined(__aarch64__)
+ } while (h > 0);
+ } else {
+    // Handle the case where width is a multiple of 8 and height is a multiple of 4.
+ int16x8_t s0, s1, s2, s3, s4, s5;
+ uint8x8_t d0;
+#if defined(__aarch64__)
+ int16x8_t s6, s7, s8;
+ uint8x8_t d1, d2, d3;
+#endif // defined(__aarch64__)
+
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+
+ s0 = vld1q_s16(s + 0 * src_stride);
+ s1 = vld1q_s16(s + 1 * src_stride);
+ s2 = vld1q_s16(s + 2 * src_stride);
+ s3 = vld1q_s16(s + 3 * src_stride);
+ s4 = vld1q_s16(s + 4 * src_stride);
+ s += (5 * src_stride);
+
+ do {
+#if defined(__aarch64__)
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+ s += (4 * src_stride);
+
+ d0 = convolve6_vert_8x4_s32(s0, s1, s2, s3, s4, s5, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d1 = convolve6_vert_8x4_s32(s1, s2, s3, s4, s5, s6, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d2 = convolve6_vert_8x4_s32(s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d3 = convolve6_vert_8x4_s32(s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d, d0);
+ d += dst_stride;
+ vst1_u8(d, d1);
+ d += dst_stride;
+ if (h != 2) {
+ vst1_u8(d, d2);
+ d += dst_stride;
+ vst1_u8(d, d3);
+ d += dst_stride;
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ height -= 4;
+#else // !defined(__aarch64__)
+ s5 = vld1q_s16(s);
+ s += src_stride;
+
+ d0 = convolve6_vert_8x4_s32(s0, s1, s2, s3, s4, s5, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d, d0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ height--;
+#endif // defined(__aarch64__)
+ } while (height > 0);
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
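Both width branches above follow the same rolling-window scheme: five input rows are loaded before the loop, each AArch64 iteration loads four more rows and emits four output rows, and the s0..s4 registers then slide down by four (by one on AArch32). A sketch of that pattern for a single column in plain C, with hypothetical names; it slides one row at a time to stay short, where the NEON version batches four:

#include <stdint.h>
#include <string.h>

/* Sliding 6-row window over one column: keep win[0..5], emit one output
 * per new input row, then shift.  apply6() stands in for the NEON helper. */
static void vert_6tap_column_model(const int16_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int h,
                                   uint8_t (*apply6)(const int16_t win[6])) {
  int16_t win[6];
  for (int i = 0; i < 5; ++i) win[i] = src[i * src_stride];
  src += 5 * src_stride;
  do {
    win[5] = *src;
    *dst = apply6(win);
    memmove(win, win + 1, 5 * sizeof(win[0]));  /* slide the window down */
    src += src_stride;
    dst += dst_stride;
  } while (--h > 0);
}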
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ if (filter_params_x->taps > 8) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+
+ av1_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_0_7,
+ x_filter_8_11, conv_params->round_0);
+
+ av1_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter_0_7, y_filter_8_11,
+ conv_params);
+ } else {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ av1_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w,
+ im_h, x_filter, conv_params->round_0);
+
+ if (clamped_y_taps <= 6) {
+ av1_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter, conv_params);
+ } else {
+ av1_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter, conv_params);
}
}
}
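The wrapper above sizes the intermediate block from the clamped vertical tap count: im_h rows of 16-bit horizontal output are produced, starting clamped_y_taps / 2 - 1 rows above and filter_params_x->taps / 2 - 1 columns to the left of the destination. A small worked check of that geometry, with illustrative block and filter sizes:

#include <assert.h>

/* Illustrative check of the intermediate-buffer geometry used above. */
static void im_block_geometry_example(void) {
  const int h = 16, y_taps = 6, x_taps = 8;
  const int clamped_y_taps = y_taps < 6 ? 6 : y_taps;
  const int im_h = h + clamped_y_taps - 1;        /* 21 rows of int16 data */
  const int vert_offset = clamped_y_taps / 2 - 1; /* start 2 rows above dst */
  const int horiz_offset = x_taps / 2 - 1;        /* start 3 columns left */
  assert(im_h == 21 && vert_offset == 2 && horiz_offset == 3);
}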
@@ -1574,8 +3306,6 @@ static INLINE void scaledconvolve_horiz_w4(
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x8_t ss[4];
int16x4_t t[8], tt;
@@ -1597,7 +3327,7 @@ static INLINE void scaledconvolve_horiz_w4(
t[7] = vget_high_s16(ss[3]);
tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters, filter3, filter4);
+ filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
} else {
@@ -1703,8 +3433,6 @@ static INLINE void scaledconvolve_vert_w4(
if (y_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x4_t t[8], tt;
@@ -1719,8 +3447,7 @@ static INLINE void scaledconvolve_vert_w4(
t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
- filter3, filter4);
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
} else {
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 27a996ce9..4e9f63621 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -19,21 +19,19 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filters,
- const int16x4_t filter3,
- const int16x4_t filter4) {
- const int16x4_t filters_lo = vget_low_s16(filters);
- const int16x4_t filters_hi = vget_high_s16(filters);
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
- sum = vmul_lane_s16(s0, filters_lo, 0);
- sum = vmla_lane_s16(sum, s1, filters_lo, 1);
- sum = vmla_lane_s16(sum, s2, filters_lo, 2);
- sum = vmla_lane_s16(sum, s5, filters_hi, 1);
- sum = vmla_lane_s16(sum, s6, filters_hi, 2);
- sum = vmla_lane_s16(sum, s7, filters_hi, 3);
- sum = vqadd_s16(sum, vmul_s16(s3, filter3));
- sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
return sum;
}
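As before, the two largest centre taps are accumulated with saturating adds while the outer taps use ordinary modular multiply-accumulates; the patch only changes the centre products to come from vmul_lane_s16 on the combined filter register instead of the pre-splatted filter3/filter4 vectors. A scalar model of that mixed accumulation, illustrative only, with the int16_t casts standing in for 16-bit wrap-around:

#include <stdint.h>

/* Saturating 16-bit add, as vqadd_s16 does per lane. */
static int16_t sat_add_s16(int16_t a, int16_t b) {
  int32_t s = (int32_t)a + b;
  if (s > INT16_MAX) s = INT16_MAX;
  if (s < INT16_MIN) s = INT16_MIN;
  return (int16_t)s;
}

/* Taps 0-2 and 5-7 accumulate with wrapping arithmetic; taps 3 and 4
 * (the largest coefficients) are added with saturation. */
static int16_t convolve8_pixel_model(const int16_t s[8], const int16_t f[8]) {
  int16_t sum = (int16_t)(s[0] * f[0] + s[1] * f[1] + s[2] * f[2] +
                          s[5] * f[5] + s[6] * f[6] + s[7] * f[7]);
  sum = sat_add_s16(sum, (int16_t)(s[3] * f[3]));
  sum = sat_add_s16(sum, (int16_t)(s[4] * f[4]));
  return sum;
}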
@@ -41,28 +39,24 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filters,
- const int16x8_t filter3,
- const int16x8_t filter4) {
- const int16x4_t filters_lo = vget_low_s16(filters);
- const int16x4_t filters_hi = vget_high_s16(filters);
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum;
- sum = vmulq_lane_s16(s0, filters_lo, 0);
- sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
- sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
- sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
- sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
- sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
- sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
- sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
return vqrshrun_n_s16(sum, 7);
}
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
- const int16x8_t filters) {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
+ const int16x8_t filter) {
int16x8_t ss[8];
ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
@@ -75,7 +69,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
- filters, filter3, filter4);
+ filter);
}
static INLINE uint8x8_t wiener_convolve8_vert_4x8(
@@ -85,28 +79,27 @@ static INLINE uint8x8_t wiener_convolve8_vert_4x8(
const int round1_bits) {
int16x8_t ss0, ss1, ss2;
int32x4_t sum0, sum1;
- uint16x4_t tmp0, tmp1;
- uint16x8_t tmp;
+ int16x8_t tmp;
uint8x8_t res;
const int32_t round_const = (1 << (bd + round1_bits - 1));
const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
- const int32x4_t zero = vdupq_n_s32(0);
const int32x4_t round_vec = vdupq_n_s32(round_const);
+ const int16x4_t filter = vld1_s16(filter_y);
ss0 = vaddq_s16(s0, s6);
ss1 = vaddq_s16(s1, s5);
ss2 = vaddq_s16(s2, s4);
- sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]);
+ sum0 = vmull_lane_s16(vget_low_s16(ss0), filter, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss1), filter, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss2), filter, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
- sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]);
+ sum1 = vmull_lane_s16(vget_high_s16(ss0), filter, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss1), filter, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss2), filter, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
sum0 = vsubq_s32(sum0, round_vec);
sum1 = vsubq_s32(sum1, round_vec);
@@ -115,14 +108,9 @@ static INLINE uint8x8_t wiener_convolve8_vert_4x8(
sum0 = vrshlq_s32(sum0, round_bits);
sum1 = vrshlq_s32(sum1, round_bits);
- sum0 = vmaxq_s32(sum0, zero);
- sum1 = vmaxq_s32(sum1, zero);
-
/* from int32x4_t to uint8x8_t */
- tmp0 = vqmovn_u32(vreinterpretq_u32_s32(sum0));
- tmp1 = vqmovn_u32(vreinterpretq_u32_s32(sum1));
- tmp = vcombine_u16(tmp0, tmp1);
- res = vqmovn_u16(tmp);
+ tmp = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ res = vqmovun_s16(tmp);
return res;
}
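The pairwise vaddq_s16 calls above rely on the vertical Wiener kernel being symmetric (tap k equals tap 6 - k), so the 7-tap sum folds into four multiplies. A scalar statement of that identity, assuming the pairwise sums stay inside the int16 range that the bounded horizontal-pass output provides:

#include <stdint.h>

/* Symmetric 7-tap fold: with f[k] == f[6 - k], the full convolution
 * sum over f[k] * s[k] equals the four-multiply form computed above. */
static int32_t wiener_vert_pixel_model(const int16_t s[7], const int16_t f[4]) {
  return f[0] * (int32_t)(s[0] + s[6]) + f[1] * (int32_t)(s[1] + s[5]) +
         f[2] * (int32_t)(s[2] + s[4]) + f[3] * (int32_t)s[3];
}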
@@ -143,10 +131,11 @@ static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+ const int16x4_t filter = vld1_s16(filter_x);
- sum = vmulq_n_s16(s0, filter_x[0]);
- sum = vmlaq_n_s16(sum, s1, filter_x[1]);
- sum = vmlaq_n_s16(sum, s2, filter_x[2]);
+ sum = vmulq_lane_s16(s0, filter, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter, 2);
/* sum from 16x8 to 2 32x4 registers */
sum_0 = vmovl_s16(vget_low_s16(sum));
@@ -156,8 +145,8 @@ static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
* then max value possible = 128*128*255 exceeding 16 bit
*/
- s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]);
- s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]);
+ s3_0 = vmull_lane_s16(vget_low_s16(s3), filter, 3);
+ s3_1 = vmull_lane_s16(vget_high_s16(s3), filter, 3);
sum_0 = vaddq_s32(sum_0, s3_0);
sum_1 = vaddq_s32(sum_1, s3_1);
@@ -189,103 +178,315 @@ static INLINE uint16x4_t wiener_convolve8_horiz_4x8(
const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
- const int32x4_t zero = vdupq_n_s32(0);
const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+ const int16x4_t filter = vld1_s16(filter_x);
temp0 = vadd_s16(s0, s6);
temp1 = vadd_s16(s1, s5);
temp2 = vadd_s16(s2, s4);
- sum = vmul_n_s16(temp0, filter_x[0]);
- sum = vmla_n_s16(sum, temp1, filter_x[1]);
- sum = vmla_n_s16(sum, temp2, filter_x[2]);
+ sum = vmul_lane_s16(temp0, filter, 0);
+ sum = vmla_lane_s16(sum, temp1, filter, 1);
+ sum = vmla_lane_s16(sum, temp2, filter, 2);
sum_0 = vmovl_s16(sum);
  /* s[3]*128 -- and filter coeff max can be 128.
* then max value possible = 128*128*255 Therefore, 32 bits are required to
* hold the result.
*/
- s3_0 = vmull_n_s16(s3, filter_x[3]);
+ s3_0 = vmull_lane_s16(s3, filter, 3);
sum_0 = vaddq_s32(sum_0, s3_0);
sum_0 = vaddq_s32(sum_0, round_vec_0);
sum_0 = vrshlq_s32(sum_0, round_bits);
- sum_0 = vmaxq_s32(sum_0, zero);
sum_0 = vminq_s32(sum_0, round_vec_1);
res = vqmovun_s32(sum_0);
return res;
}
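The vmaxq_s32 clamps against zero removed here (and in the other helpers touched by this patch) were redundant: the final saturating-unsigned narrow already maps any negative result to 0. A one-line scalar model of what that narrowing step does:

#include <stdint.h>

/* vqmovun_s32-style narrow: clamp to [0, 65535]; the 8-bit variant
 * (vqmovun_s16) clamps to [0, 255] the same way, which is why the
 * explicit max-with-zero steps could be dropped. */
static uint16_t saturate_narrow_u16_model(int32_t x) {
  if (x < 0) return 0;
  if (x > 65535) return 65535;
  return (uint16_t)x;
}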
-static INLINE int16x8_t
-convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
- const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+static INLINE int16x8_t convolve8_8x8_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+ const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum;
- int16x8_t res;
sum = horiz_const;
- sum = vmlaq_n_s16(sum, s0, filter[0]);
- sum = vmlaq_n_s16(sum, s1, filter[1]);
- sum = vmlaq_n_s16(sum, s2, filter[2]);
- sum = vmlaq_n_s16(sum, s3, filter[3]);
- sum = vmlaq_n_s16(sum, s4, filter[4]);
- sum = vmlaq_n_s16(sum, s5, filter[5]);
- sum = vmlaq_n_s16(sum, s6, filter[6]);
- sum = vmlaq_n_s16(sum, s7, filter[7]);
+ sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
- res = vqrshlq_s16(sum, shift_round_0);
+ sum = vqrshlq_s16(sum, shift_round_0);
- return res;
+ return sum;
}
-static INLINE int16x4_t
-convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
- const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
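Each 16-byte row of the table above gathers four overlapping 4-sample windows, so a single dot-product instruction evaluates four adjacent output positions against four consecutive filter taps; the later rows cover the windows further along the source row. A scalar sketch of what one permuted row contributes, using an illustrative helper that is not part of the patch:

#include <stdint.h>

/* Model of one permute-table row feeding a 4-tap dot product: output n
 * pairs src[n..n+3] with filter taps f[0..3]; the next table row pairs
 * the same outputs with the following four taps and samples shifted by
 * four positions. */
static void dotprod_window_model(const uint8_t *src, const int8_t f[4],
                                 int32_t out[4]) {
  for (int n = 0; n < 4; ++n) {
    out[n] = 0;
    for (int k = 0; k < 4; ++k) out[n] += f[k] * (int32_t)src[n + k];
  }
}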
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl,
+ const int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+ /* First 4 output values. */
+ sum = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE int16x8_t convolve8_8_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const,
+ const int16x8_t shift_round_0) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ return vqrshlq_s16(sum, shift_round_0);
+}
+
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[2];
+ int32x4_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+ sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE int16x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl,
+ const int16x8_t shift_round_0) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+ sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+ sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ return vqrshlq_s16(sum, shift_round_0);
+}
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
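The SDOT fallback above operates on signed bytes only, so the unsigned samples are biased into [-128, 127] by subtracting 128, and the accumulator starts from a correction of 128 times the filter-tap sum (plus any rounding offset) to undo that bias. A small worked check of the underlying identity, with made-up tap and sample values:

#include <assert.h>
#include <stdint.h>

/* sum(f[k] * s[k]) == sum(f[k] * (s[k] - 128)) + 128 * sum(f[k]).
 * The right-hand side is what the SDOT path computes: biased samples
 * plus a precomputed correction term. */
static void sdot_correction_check(void) {
  const int8_t f[8] = { -1, 3, -10, 72, 72, -10, 3, -1 }; /* example taps */
  const uint8_t s[8] = { 10, 50, 90, 130, 170, 210, 250, 20 };
  int32_t direct = 0, biased = 0, filter_sum = 0;
  for (int k = 0; k < 8; ++k) {
    direct += f[k] * (int32_t)s[k];
    biased += f[k] * ((int32_t)s[k] - 128);
    filter_sum += f[k];
  }
  assert(direct == biased + 128 * filter_sum);
}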
+static INLINE int16x4_t convolve8_4x4_s16(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
+ const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
+
sum = horiz_const;
- sum = vmla_n_s16(sum, s0, filter[0]);
- sum = vmla_n_s16(sum, s1, filter[1]);
- sum = vmla_n_s16(sum, s2, filter[2]);
- sum = vmla_n_s16(sum, s3, filter[3]);
- sum = vmla_n_s16(sum, s4, filter[4]);
- sum = vmla_n_s16(sum, s5, filter[5]);
- sum = vmla_n_s16(sum, s6, filter[6]);
- sum = vmla_n_s16(sum, s7, filter[7]);
+ sum = vmla_lane_s16(sum, s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
sum = vqrshl_s16(sum, shift_round_0);
return sum;
}
-static INLINE uint16x4_t convolve8_4x4_s32(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
- const int32x4_t round_shift_vec, const int32x4_t offset_const) {
- int32x4_t sum0;
- uint16x4_t res;
- const int32x4_t zero = vdupq_n_s32(0);
-
- sum0 = vmull_n_s16(s0, y_filter[0]);
- sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
- sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
- sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
- sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
- sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
- sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
- sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
-
- sum0 = vaddq_s32(sum0, offset_const);
+static INLINE uint16x4_t convolve6_4_s32(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter,
+ const int32x4_t round_shift_vec,
+ const int32x4_t offset_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = offset_const;
+ sum = vmlal_lane_s16(sum, s0, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
+
+ sum = vqrshlq_s32(sum, round_shift_vec);
+ return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t convolve6_8_s32(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter,
+ const int32x4_t round_shift_vec,
+ const int32x4_t offset_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = offset_const;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
+
+ int32x4_t sum1 = offset_const;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
+
sum0 = vqrshlq_s32(sum0, round_shift_vec);
- sum0 = vmaxq_s32(sum0, zero);
- res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+ sum1 = vqrshlq_s32(sum1, round_shift_vec);
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
- return res;
+static INLINE uint16x4_t convolve8_4_s32(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t y_filter,
+ const int32x4_t round_shift_vec,
+ const int32x4_t offset_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = offset_const;
+ sum = vmlal_lane_s16(sum, s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ sum = vqrshlq_s32(sum, round_shift_vec);
+ return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t convolve8_8_s32(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t y_filter,
+ const int32x4_t round_shift_vec,
+ const int32x4_t offset_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = offset_const;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ int32x4_t sum1 = offset_const;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
+
+ sum0 = vqrshlq_s32(sum0, round_shift_vec);
+ sum1 = vqrshlq_s32(sum1, round_shift_vec);
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index e0b76a87b..36c8f9cb8 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -45,7 +45,7 @@ static INLINE void compute_avg_4x1(
dst0 = vqrshlq_s32(dst0, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
+ tmp0 = vmovn_s32(dst0);
tmp4 = vcombine_s16(tmp0, tmp0);
*t0 = vqmovun_s16(tmp4);
@@ -57,7 +57,7 @@ static INLINE void compute_avg_4x1(
tmp0 = vqrshl_s16(tmp0, round_bits_vec);
- tmp4 = vcombine_s16(tmp0, tmp0);
+ tmp4 = vcombine_s16(tmp0, vdup_n_s16(0));
*t0 = vqmovun_s16(tmp4);
}
@@ -67,7 +67,6 @@ static INLINE void compute_avg_8x1(
uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset,
const uint16_t bck_offset, const int16x4_t sub_const,
const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
- int16x4_t tmp0, tmp2;
int16x8_t f0;
uint32x4_t sum0, sum2;
int32x4_t dst0, dst2;
@@ -92,10 +91,7 @@ static INLINE void compute_avg_8x1(
dst0 = vqrshlq_s32(dst0, round_bits_vec);
dst2 = vqrshlq_s32(dst2, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
- tmp2 = vqmovn_s32(dst2);
-
- f0 = vcombine_s16(tmp0, tmp2);
+ f0 = vcombine_s16(vmovn_s32(dst0), vmovn_s32(dst2));
*t0 = vqmovun_s16(f0);
@@ -126,7 +122,6 @@ static INLINE void compute_avg_4x4(
int32x4_t dst0, dst1, dst2, dst3;
int16x8_t tmp4, tmp5;
- const int16x8_t zero = vdupq_n_s16(0);
if (use_dist_wtd_comp_avg) {
const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
@@ -156,17 +151,11 @@ static INLINE void compute_avg_4x4(
dst2 = vqrshlq_s32(dst2, round_bits_vec);
dst3 = vqrshlq_s32(dst3, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
- tmp1 = vqmovn_s32(dst1);
- tmp2 = vqmovn_s32(dst2);
- tmp3 = vqmovn_s32(dst3);
- tmp4 = vcombine_s16(tmp0, tmp1);
- tmp5 = vcombine_s16(tmp2, tmp3);
- tmp4 = vmaxq_s16(tmp4, zero);
- tmp5 = vmaxq_s16(tmp5, zero);
+ tmp4 = vcombine_s16(vmovn_s32(dst0), vmovn_s32(dst1));
+ tmp5 = vcombine_s16(vmovn_s32(dst2), vmovn_s32(dst3));
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+ *t0 = vqmovun_s16(tmp4);
+ *t1 = vqmovun_s16(tmp5);
} else {
const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
tmp_u0 = vhadd_u16(res0, d0);
@@ -186,11 +175,9 @@ static INLINE void compute_avg_4x4(
tmp4 = vcombine_s16(tmp0, tmp1);
tmp5 = vcombine_s16(tmp2, tmp3);
- tmp4 = vmaxq_s16(tmp4, zero);
- tmp5 = vmaxq_s16(tmp5, zero);
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+ *t0 = vqmovun_s16(tmp4);
+ *t1 = vqmovun_s16(tmp5);
}
}
@@ -201,14 +188,12 @@ static INLINE void compute_avg_8x4(
const int16x4_t sub_const, const int16_t round_bits,
const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1,
uint8x8_t *t2, uint8x8_t *t3) {
- int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int16x8_t f0, f1, f2, f3;
uint32x4_t sum0, sum1, sum2, sum3;
uint32x4_t sum4, sum5, sum6, sum7;
int32x4_t dst0, dst1, dst2, dst3;
int32x4_t dst4, dst5, dst6, dst7;
uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
- const int16x8_t zero = vdupq_n_s16(0);
if (use_dist_wtd_comp_avg) {
const int32x4_t sub_const_vec = vmovl_s16(sub_const);
@@ -260,29 +245,15 @@ static INLINE void compute_avg_8x4(
dst6 = vqrshlq_s32(dst6, round_bits_vec);
dst7 = vqrshlq_s32(dst7, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
- tmp1 = vqmovn_s32(dst1);
- tmp2 = vqmovn_s32(dst2);
- tmp3 = vqmovn_s32(dst3);
- tmp4 = vqmovn_s32(dst4);
- tmp5 = vqmovn_s32(dst5);
- tmp6 = vqmovn_s32(dst6);
- tmp7 = vqmovn_s32(dst7);
-
- f0 = vcombine_s16(tmp0, tmp2);
- f1 = vcombine_s16(tmp1, tmp3);
- f2 = vcombine_s16(tmp4, tmp6);
- f3 = vcombine_s16(tmp5, tmp7);
-
- f0 = vmaxq_s16(f0, zero);
- f1 = vmaxq_s16(f1, zero);
- f2 = vmaxq_s16(f2, zero);
- f3 = vmaxq_s16(f3, zero);
-
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
- *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
- *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+ f0 = vcombine_s16(vmovn_s32(dst0), vmovn_s32(dst2));
+ f1 = vcombine_s16(vmovn_s32(dst1), vmovn_s32(dst3));
+ f2 = vcombine_s16(vmovn_s32(dst4), vmovn_s32(dst6));
+ f3 = vcombine_s16(vmovn_s32(dst5), vmovn_s32(dst7));
+
+ *t0 = vqmovun_s16(f0);
+ *t1 = vqmovun_s16(f1);
+ *t2 = vqmovun_s16(f2);
+ *t3 = vqmovun_s16(f3);
} else {
const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
@@ -303,21 +274,205 @@ static INLINE void compute_avg_8x4(
f2 = vqrshlq_s16(f2, round_bits_vec);
f3 = vqrshlq_s16(f3, round_bits_vec);
- f0 = vmaxq_s16(f0, zero);
- f1 = vmaxq_s16(f1, zero);
- f2 = vmaxq_s16(f2, zero);
- f3 = vmaxq_s16(f3, zero);
+ *t0 = vqmovun_s16(f0);
+ *t1 = vqmovun_s16(f1);
+ *t2 = vqmovun_s16(f2);
+ *t3 = vqmovun_s16(f3);
+ }
+}
+
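The compute_avg_* helpers above blend the freshly convolved values with those already stored in the compound buffer, either with per-reference distance weights or with a simple halving add, before the usual offset removal, rounding and clamp to 8 bits. A rough scalar model of one pixel of that blend; the constant and the ordering of the final steps are illustrative, based on the weights summing to 1 << DIST_PRECISION_BITS in libaom:

#include <stdint.h>

#define DIST_PRECISION_BITS_MODEL 4 /* fwd + bck weights sum to 16 */

/* One-pixel model of the compound average: weighted or simple mean of
 * the stored value and the new one, then offset removal, rounding and
 * an 8-bit clamp. */
static uint8_t compound_avg_model(uint16_t stored, uint16_t fresh,
                                  int use_dist_wtd, int fwd, int bck,
                                  int32_t sub_const, int round_bits) {
  int32_t avg;
  if (use_dist_wtd)
    avg = ((int32_t)stored * fwd + (int32_t)fresh * bck) >>
          DIST_PRECISION_BITS_MODEL;
  else
    avg = ((int32_t)stored + fresh) >> 1; /* like vhadd_u16 */
  avg -= sub_const;
  avg = (avg + (1 << (round_bits - 1))) >> round_bits;
  if (avg < 0) avg = 0;
  if (avg > 255) avg = 255;
  return (uint8_t)avg;
}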
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ const int16x8_t x_filter_s16, const int im_h, int w, const int round_0) {
+ const int bd = 8;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int width = w;
+ int height = im_h;
+
+ const int8x8_t x_filter = vmovn_s16(x_filter_s16);
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (bd + FILTER_BITS - 2));
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t t0, t1, t2, t3;
+ int16x4_t d0, d1, d2, d3;
+
+ do {
+ s0 = vld1q_u8(src + 0 * src_stride);
+ s1 = vld1q_u8(src + 1 * src_stride);
+ s2 = vld1q_u8(src + 2 * src_stride);
+ s3 = vld1q_u8(src + 3 * src_stride);
+
+ t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
+ t1 = convolve8_4_usdot(s1, x_filter, permute_tbl, horiz_const);
+ t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
+ t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
+
+ d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+ d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
+ d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
+ d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
+
+ vst1_s16((dst_ptr + 0 * dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * dst_stride), d3);
+
+ src += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
+ const uint8_t *s;
+ int16_t *d;
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst_ptr;
+
+ do {
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve8_8_usdot(s0, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d1 = convolve8_8_usdot(s1, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d2 = convolve8_8_usdot(s2, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+ d3 = convolve8_8_usdot(s3, x_filter, permute_tbl, horiz_const,
+ shift_round_0);
+
+ vst1q_s16(d + 0 * dst_stride, d0);
+ vst1q_s16(d + 1 * dst_stride, d1);
+ vst1q_s16(d + 2 * dst_stride, d2);
+ vst1q_s16(d + 3 * dst_stride, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ }
+}
+
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ const int16x8_t x_filter_s16, const int im_h, int w, const int round_0) {
+ const int bd = 8;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int width = w;
+ int height = im_h;
+
+ const int8x8_t x_filter = vmovn_s16(x_filter_s16);
+ const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+ // Dot product constants.
+ const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 7);
+ const int32x4_t correction =
+ vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t t0, t1, t2, t3;
+ int16x4_t d0, d1, d2, d3;
+
+ do {
+ s0 = vld1q_u8(src + 0 * src_stride);
+ s1 = vld1q_u8(src + 1 * src_stride);
+ s2 = vld1q_u8(src + 2 * src_stride);
+ s3 = vld1q_u8(src + 3 * src_stride);
+
+ t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
+
+ d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+ d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
+ d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
+ d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
+
+ vst1_s16((dst_ptr + 0 * dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * dst_stride), d3);
+
+ src += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
+ const uint8_t *s;
+ int16_t *d;
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst_ptr;
+
+ do {
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d1 = convolve8_8_sdot(s1, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d2 = convolve8_8_sdot(s2, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d3 = convolve8_8_sdot(s3, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+
+ vst1q_s16(d + 0 * dst_stride, d0);
+ vst1q_s16(d + 1 * dst_stride, d1);
+ vst1q_s16(d + 2 * dst_stride, d2);
+ vst1q_s16(d + 3 * dst_stride, d3);
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
- *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
- *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
}
}
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
static INLINE void dist_wtd_convolve_2d_horiz_neon(
const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
- int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
+ const int16x8_t x_filter, const int im_h, int w, const int round_0) {
const int bd = 8;
const uint8_t *s;
int16_t *dst_ptr;
@@ -380,13 +535,13 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon(
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const, shift_round_0);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const, shift_round_0);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const, shift_round_0);
transpose_s16_4x4d(&d0, &d1, &d2, &d3);
@@ -418,7 +573,7 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon(
s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
vst1_s16(dst_ptr, d0);
@@ -483,22 +638,22 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon(
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const, shift_round_0);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const, shift_round_0);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const, shift_round_0);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
horiz_const, shift_round_0);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
horiz_const, shift_round_0);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter_tmp, horiz_const, shift_round_0);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter_tmp, horiz_const, shift_round_0);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter_tmp, horiz_const, shift_round_0);
transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
&res7);
@@ -543,8 +698,8 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon(
s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
- res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
- x_filter_tmp, horiz_const, shift_round_0);
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const, shift_round_0);
vst1q_s16(d_tmp, res0);
s += 8;
@@ -559,17 +714,15 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon(
}
}
-static INLINE void dist_wtd_convolve_2d_vert_neon(
- int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
- ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
- uint8_t *dst_u8_ptr, *d_u8;
- CONV_BUF_TYPE *dst_ptr, *dst;
- int16_t *src_ptr, *s;
- uint16_t *d;
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void dist_wtd_convolve_2d_vert_6tap_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
const int bd = 8;
- int height;
- int dst_stride = conv_params->dst_stride;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
@@ -585,85 +738,268 @@ static INLINE void dist_wtd_convolve_2d_vert_neon(
const int do_average = conv_params->do_average;
const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t res4, d0;
- uint8x8_t t0;
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5;
+ uint16x4_t dd0, d0;
+ uint8x8_t d01_u8;
#if defined(__aarch64__)
- int16x4_t s8, s9, s10;
- uint16x4_t res5, res6, res7, d1, d2, d3;
- uint8x8_t t1;
+ int16x4_t s6, s7, s8;
+ uint16x4_t dd1, dd2, dd3, d1, d2, d3;
+ uint8x8_t d23_u8;
#endif
- dst = conv_params->dst;
- src_ptr = im_block;
- dst_u8_ptr = dst8;
- dst_ptr = dst;
- height = h;
+ s0 = vld1_s16(src_ptr + 0 * src_stride);
+ s1 = vld1_s16(src_ptr + 1 * src_stride);
+ s2 = vld1_s16(src_ptr + 2 * src_stride);
+ s3 = vld1_s16(src_ptr + 3 * src_stride);
+ s4 = vld1_s16(src_ptr + 4 * src_stride);
+ src_ptr += 5 * src_stride;
- do {
- d = dst_ptr;
- d_u8 = dst_u8_ptr;
- s = src_ptr;
- height = h;
+ do {
+#if defined(__aarch64__)
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
- __builtin_prefetch(s + 0 * im_stride);
- __builtin_prefetch(s + 1 * im_stride);
- __builtin_prefetch(s + 2 * im_stride);
- __builtin_prefetch(s + 3 * im_stride);
- __builtin_prefetch(s + 4 * im_stride);
- __builtin_prefetch(s + 5 * im_stride);
- __builtin_prefetch(s + 6 * im_stride);
- __builtin_prefetch(s + 7 * im_stride);
+ d0 = convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, round_shift_vec,
+ offset_const);
+ d1 = convolve6_4_s32(s1, s2, s3, s4, s5, s6, y_filter, round_shift_vec,
+ offset_const);
+ d2 = convolve6_4_s32(s2, s3, s4, s5, s6, s7, y_filter, round_shift_vec,
+ offset_const);
+ d3 = convolve6_4_s32(s3, s4, s5, s6, s7, s8, y_filter, round_shift_vec,
+ offset_const);
- load_s16_4x8(s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- s += (7 * im_stride);
+ if (do_average) {
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ compute_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, sub_const_vec, round_bits,
+ use_dist_wtd_comp_avg, &d01_u8, &d23_u8);
+
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d01_u8), 0);
+ dst8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d01_u8), 1);
+ dst8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d23_u8), 0);
+ dst8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d23_u8), 1);
+ dst8_ptr += dst8_stride;
+ } else {
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else
+ s5 = vld1_s16(src_ptr);
+
+ d0 = convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, round_shift_vec,
+ offset_const);
+
+ if (do_average) {
+ dd0 = vld1_u16(dst_ptr);
+
+ compute_avg_4x1(dd0, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_dist_wtd_comp_avg, &d01_u8);
+
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d01_u8), 0);
+ dst8_ptr += dst8_stride;
+
+ } else {
+ vst1_u16(dst_ptr, d0);
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif
+ } while (h > 0);
+
+ } else {
+ int16x8_t s0, s1, s2, s3, s4, s5;
+ uint16x8_t dd0, d0;
+ uint8x8_t d0_u8;
+
+#if defined(__aarch64__)
+ int16x8_t s6, s7, s8;
+ uint16x8_t dd1, dd2, dd3, d1, d2, d3;
+ uint8x8_t d1_u8, d2_u8, d3_u8;
+#endif
do {
+ int16_t *s = src_ptr;
+ uint16_t *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ s0 = vld1q_s16(s + 0 * src_stride);
+ s1 = vld1q_s16(s + 1 * src_stride);
+ s2 = vld1q_s16(s + 2 * src_stride);
+ s3 = vld1q_s16(s + 3 * src_stride);
+ s4 = vld1q_s16(s + 4 * src_stride);
+ s += 5 * src_stride;
+
+ do {
#if defined(__aarch64__)
- load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10);
- s += (im_stride << 2);
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ d0 = convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, round_shift_vec,
+ offset_const);
+ d1 = convolve6_8_s32(s1, s2, s3, s4, s5, s6, y_filter, round_shift_vec,
+ offset_const);
+ d2 = convolve6_8_s32(s2, s3, s4, s5, s6, s7, y_filter, round_shift_vec,
+ offset_const);
+ d3 = convolve6_8_s32(s3, s4, s5, s6, s7, s8, y_filter, round_shift_vec,
+ offset_const);
+
+ if (do_average) {
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ compute_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, sub_const_vec, round_bits,
+ use_dist_wtd_comp_avg, &d0_u8, &d1_u8, &d2_u8,
+ &d3_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+ vst1_u8(d_u8, d1_u8);
+ d_u8 += dst8_stride;
+ vst1_u8(d_u8, d2_u8);
+ d_u8 += dst8_stride;
+ vst1_u8(d_u8, d3_u8);
+ d_u8 += dst8_stride;
+ } else {
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ }
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
- __builtin_prefetch(d_u8 + 4 * dst8_stride);
- __builtin_prefetch(d_u8 + 5 * dst8_stride);
- __builtin_prefetch(d_u8 + 6 * dst8_stride);
- __builtin_prefetch(d_u8 + 7 * dst8_stride);
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else
+ s5 = vld1q_s16(s);
- d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_shift_vec, offset_const);
- d1 = convolve8_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- round_shift_vec, offset_const);
- d2 = convolve8_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- round_shift_vec, offset_const);
- d3 = convolve8_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- round_shift_vec, offset_const);
+ d0 = convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, round_shift_vec,
+ offset_const);
- if (do_average) {
- load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
- d += (dst_stride << 2);
+ if (do_average) {
+ dd0 = vld1q_u16(d);
- compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset,
- bck_offset, sub_const_vec, round_bits,
- use_dist_wtd_comp_avg, &t0, &t1);
+ compute_avg_8x1(dd0, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_dist_wtd_comp_avg, &d0_u8);
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
- d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
- d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
- d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
- d_u8 += dst8_stride;
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ } else {
+ vst1q_u16(d, d0);
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif
+ } while (height > 0);
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
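As in the 8-tap variant that follows, each branch either writes the 16-bit convolution results into conv_params->dst (first reference of a compound pair) or, when do_average is set, blends them with the values already stored there and emits 8-bit pixels to dst8_ptr. A compact sketch of that two-pass flow, with hypothetical names:

#include <stdint.h>

/* Two-pass compound flow: pass 1 stores 16-bit convolve output in the
 * compound buffer; pass 2 averages the new output with the stored one
 * and writes final 8-bit pixels.  avg() stands in for compute_avg_*. */
static void compound_row_model(const uint16_t *conv_out, uint16_t *conv_buf,
                               uint8_t *dst8, int n, int do_average,
                               uint8_t (*avg)(uint16_t stored, uint16_t fresh)) {
  for (int i = 0; i < n; ++i) {
    if (do_average)
      dst8[i] = avg(conv_buf[i], conv_out[i]);
    else
      conv_buf[i] = conv_out[i];
  }
}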
+static INLINE void dist_wtd_convolve_2d_vert_8tap_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+
+ const int16_t round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset);
+ const int16x4_t sub_const_vec = vdup_n_s16(sub_const);
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t dd0, d0;
+ uint8x8_t d01_u8;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t dd1, dd2, dd3, d1, d2, d3;
+ uint8x8_t d23_u8;
+#endif
+ load_s16_4x8(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if defined(__aarch64__)
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ d0 = convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+ d1 = convolve8_4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const);
+ d2 = convolve8_4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const);
+ d3 = convolve8_4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ compute_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, sub_const_vec, round_bits,
+ use_dist_wtd_comp_avg, &d01_u8, &d23_u8);
+
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d01_u8), 0);
+ dst8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d01_u8), 1);
+ dst8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d23_u8), 0);
+ dst8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d23_u8), 1);
+ dst8_ptr += dst8_stride;
} else {
- store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
- d += (dst_stride << 2);
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
}
+
s0 = s4;
s1 = s5;
s2 = s6;
@@ -671,30 +1007,27 @@ static INLINE void dist_wtd_convolve_2d_vert_neon(
s4 = s8;
s5 = s9;
s6 = s10;
- height -= 4;
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
#else
- s7 = vld1_s16(s);
- s += (im_stride);
+ s7 = vld1_s16(src_ptr);
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d_u8 + 0 * dst8_stride);
-
- d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_shift_vec, offset_const);
+ d0 = convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
if (do_average) {
- res4 = vld1_u16(d);
- d += (dst_stride);
+ dd0 = vld1_u16(dst_ptr);
- compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
- round_bits, use_dist_wtd_comp_avg, &t0);
+ compute_avg_4x1(dd0, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_dist_wtd_comp_avg, &d01_u8);
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
- d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst8_ptr, vreinterpret_u32_u8(d01_u8), 0);
+ dst8_ptr += dst8_stride;
} else {
- vst1_u16(d, d0);
- d += (dst_stride);
+ vst1_u16(dst_ptr, d0);
}
s0 = s1;
s1 = s2;
@@ -703,14 +1036,116 @@ static INLINE void dist_wtd_convolve_2d_vert_neon(
s4 = s5;
s5 = s6;
s6 = s7;
- height--;
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
#endif
- } while (height > 0);
- src_ptr += 4;
- dst_ptr += 4;
- dst_u8_ptr += 4;
- w -= 4;
- } while (w > 0);
+ } while (h > 0);
+
+ } else {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x8_t dd0, d0;
+ uint8x8_t d0_u8;
+
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint16x8_t dd1, dd2, dd3, d1, d2, d3;
+ uint8x8_t d1_u8, d2_u8, d3_u8;
+#endif
+
+ do {
+ int16_t *s = src_ptr;
+ uint16_t *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ s += 7 * src_stride;
+
+ do {
+#if defined(__aarch64__)
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ d0 = convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+ d1 = convolve8_8_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const);
+ d2 = convolve8_8_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const);
+ d3 = convolve8_8_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ compute_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, sub_const_vec, round_bits,
+ use_dist_wtd_comp_avg, &d0_u8, &d1_u8, &d2_u8,
+ &d3_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+ vst1_u8(d_u8, d1_u8);
+ d_u8 += dst8_stride;
+ vst1_u8(d_u8, d2_u8);
+ d_u8 += dst8_stride;
+ vst1_u8(d_u8, d3_u8);
+ d_u8 += dst8_stride;
+ } else {
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else
+ s7 = vld1q_s16(s);
+
+ d0 = convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ dd0 = vld1q_u16(d);
+
+ compute_avg_8x1(dd0, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_dist_wtd_comp_avg, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ } else {
+ vst1q_u16(d, d0);
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif
+ } while (height > 0);
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
}
void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
@@ -725,30 +1160,36 @@ void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
const int im_h = h + filter_params_y->taps - 1;
const int im_stride = MAX_SB_SIZE;
const int vert_offset = filter_params_y->taps / 2 - 1;
const int horiz_offset = filter_params_x->taps / 2 - 1;
const int round_0 = conv_params->round_0 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
- int16_t x_filter_tmp[8];
- int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
- vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
- x_filter_tmp, im_h, w, round_0);
+ x_filter, im_h, w, round_0);
- dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride,
- conv_params, y_filter, h, w);
+ if (clamped_y_taps == 6) {
+ dist_wtd_convolve_2d_vert_6tap_neon(im_block + im_stride, im_stride, dst8,
+ dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, dst8, dst8_stride,
+ conv_params, y_filter, h, w);
+ }
}
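
The x_filter halving above (together with passing round_0 - 1 to the horizontal pass) relies on the interpolation kernel coefficients all being even and summing to 128 (FILTER_BITS = 7): halving the taps halves every intermediate sum exactly, and rounding with one bit less restores the same result while intermediate values gain one bit of headroom. A minimal scalar sketch of that equivalence, using a made-up even-valued kernel rather than one of libaom's real filters:

/* Scalar sketch (illustrative only): halving all-even taps and rounding with
 * one bit less is bit-identical, while intermediate sums are one bit smaller.
 * The kernel below is a made-up even-valued example, not a libaom filter. */
#include <assert.h>
#include <stdint.h>

static int round_shift(int x, int bits) { return (x + (1 << (bits - 1))) >> bits; }

static int convolve8_scalar(const uint8_t *s, const int16_t *f, int round_bits) {
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += f[k] * s[k];
  return round_shift(sum, round_bits);
}

int main(void) {
  const int16_t taps[8] = { -2, 6, -14, 74, 74, -14, 6, -2 };  /* even, sums to 128 */
  const uint8_t src[8] = { 12, 250, 3, 77, 128, 9, 200, 64 };
  int16_t half_taps[8];
  for (int k = 0; k < 8; ++k) half_taps[k] = taps[k] >> 1;
  const int round_0 = 3;  /* assumed example value of conv_params->round_0 */
  assert(convolve8_scalar(src, taps, round_0) ==
         convolve8_scalar(src, half_taps, round_0 - 1));
  return 0;
}
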
void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
@@ -869,6 +1310,376 @@ void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
}
}
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(!(w % 4));
+ assert(!(h % 4));
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
+ const int16x8_t horiz_const = vdupq_n_s16(bits);
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ CONV_BUF_TYPE *dst_ptr = dst;
+ uint8_t *dst_u8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int width = w;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t d0, d1, d2, d3;
+ int16x8_t d01, d23;
+ uint16x4_t dd0, dd1, dd2, dd3;
+ uint8x8_t d01_u8, d23_u8;
+
+ s0 = vld1q_u8(src_ptr + 0 * src_stride);
+ s1 = vld1q_u8(src_ptr + 1 * src_stride);
+ s2 = vld1q_u8(src_ptr + 2 * src_stride);
+ s3 = vld1q_u8(src_ptr + 3 * src_stride);
+
+ d0 = convolve8_4_usdot(s0, x_filter, permute_tbl, vdupq_n_s32(0));
+ d1 = convolve8_4_usdot(s1, x_filter, permute_tbl, vdupq_n_s32(0));
+ d2 = convolve8_4_usdot(s2, x_filter, permute_tbl, vdupq_n_s32(0));
+ d3 = convolve8_4_usdot(s3, x_filter, permute_tbl, vdupq_n_s32(0));
+
+ d01 = vcombine_s16(vmovn_s32(d0), vmovn_s32(d1));
+ d23 = vcombine_s16(vmovn_s32(d2), vmovn_s32(d3));
+
+ d01 = vqrshlq_s16(d01, shift_round_0);
+ d23 = vqrshlq_s16(d23, shift_round_0);
+
+ d01 = vrshlq_s16(d01, horiz_const);
+ d23 = vrshlq_s16(d23, horiz_const);
+
+ d01 = vaddq_s16(d01, round_offset128);
+ d23 = vaddq_s16(d23, round_offset128);
+
+ if (conv_params->do_average) {
+ dd0 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+ dd1 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+ dd2 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+ dd3 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+
+ compute_avg_4x4(dd0, dd1, dd2, dd3,
+ vreinterpret_u16_s16(vget_low_s16(d01)),
+ vreinterpret_u16_s16(vget_high_s16(d01)),
+ vreinterpret_u16_s16(vget_low_s16(d23)),
+ vreinterpret_u16_s16(vget_high_s16(d23)), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_dist_wtd_comp_avg, &d01_u8, &d23_u8);
+
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d01_u8), 0);
+ dst_u8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d01_u8), 1);
+ dst_u8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d23_u8), 0);
+ dst_u8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d23_u8), 1);
+ dst_u8_ptr += dst8_stride;
+ } else {
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d01), 0);
+ dst_ptr += dst_stride;
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d01), 1);
+ dst_ptr += dst_stride;
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d23), 0);
+ dst_ptr += dst_stride;
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d23), 1);
+ dst_ptr += dst_stride;
+ }
+
+ src_ptr += 4 * src_stride;
+ height -= 4;
+ } while (height > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst_u8_ptr;
+ width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t d0, d1, d2, d3;
+ uint16x8_t dd0, dd1, dd2, dd3;
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve8_8_usdot(s0, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+ d1 = convolve8_8_usdot(s1, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+ d2 = convolve8_8_usdot(s2, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+ d3 = convolve8_8_usdot(s3, x_filter, permute_tbl, vdupq_n_s32(0),
+ shift_round_0);
+
+ d0 = vrshlq_s16(d0, horiz_const);
+ d1 = vrshlq_s16(d1, horiz_const);
+ d2 = vrshlq_s16(d2, horiz_const);
+ d3 = vrshlq_s16(d3, horiz_const);
+
+ d0 = vaddq_s16(d0, round_offset128);
+ d1 = vaddq_s16(d1, round_offset128);
+ d2 = vaddq_s16(d2, round_offset128);
+ d3 = vaddq_s16(d3, round_offset128);
+
+ if (conv_params->do_average) {
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ compute_avg_8x4(dd0, dd1, dd2, dd3, vreinterpretq_u16_s16(d0),
+ vreinterpretq_u16_s16(d1), vreinterpretq_u16_s16(d2),
+ vreinterpretq_u16_s16(d3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ } else {
+ store_u16_8x4(d, dst_stride, vreinterpretq_u16_s16(d0),
+ vreinterpretq_u16_s16(d1), vreinterpretq_u16_s16(d2),
+ vreinterpretq_u16_s16(d3));
+ }
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst_u8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height > 0);
+ }
+}
+
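
For a sense of scale, the compound rounding constants set up at the top of this function collapse to small fixed values for 8-bit input. The round_0 = 3 and round_1 = 7 figures below are assumed typical compound-prediction values, not something this code asserts:

/* Illustrative arithmetic only; round_0/round_1 are assumptions. */
#include <stdio.h>
int main(void) {
  const int bd = 8, filter_bits = 7;  /* FILTER_BITS */
  const int round_0 = 3, round_1 = 7; /* assumed typical compound values */
  const int offset_bits = bd + 2 * filter_bits - round_0;       /* 19 */
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));  /* 4096 + 2048 = 6144 */
  const int round_bits = 2 * filter_bits - round_0 - round_1;   /* 4 */
  printf("offset_bits=%d round_offset=%d round_bits=%d\n", offset_bits,
         round_offset, round_bits);
  return 0;
}
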
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(!(w % 4));
+ assert(!(h % 4));
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
+ const int16x8_t horiz_const = vdupq_n_s16(bits);
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+ // Dot-product constants.
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const int32_t correction_s32 = vaddlvq_s16(vshll_n_s8(x_filter, 7));
+ const int32x4_t correction = vdupq_n_s32(correction_s32);
+
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ CONV_BUF_TYPE *dst_ptr = dst;
+ uint8_t *dst_u8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int width = w;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ int32x4_t d0, d1, d2, d3;
+ int16x8_t d01, d23;
+ uint16x4_t dd0, dd1, dd2, dd3;
+ uint8x8_t d01_u8, d23_u8;
+
+ s0 = vld1q_u8(src_ptr + 0 * src_stride);
+ s1 = vld1q_u8(src_ptr + 1 * src_stride);
+ s2 = vld1q_u8(src_ptr + 2 * src_stride);
+ s3 = vld1q_u8(src_ptr + 3 * src_stride);
+
+ d0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
+ d1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
+ d2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
+ d3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
+
+ d01 = vcombine_s16(vmovn_s32(d0), vmovn_s32(d1));
+ d23 = vcombine_s16(vmovn_s32(d2), vmovn_s32(d3));
+
+ d01 = vqrshlq_s16(d01, shift_round_0);
+ d23 = vqrshlq_s16(d23, shift_round_0);
+
+ d01 = vrshlq_s16(d01, horiz_const);
+ d23 = vrshlq_s16(d23, horiz_const);
+
+ d01 = vaddq_s16(d01, round_offset128);
+ d23 = vaddq_s16(d23, round_offset128);
+
+ if (conv_params->do_average) {
+ dd0 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+ dd1 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+ dd2 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+ dd3 = vld1_u16(dst_ptr);
+ dst_ptr += dst_stride;
+
+ compute_avg_4x4(dd0, dd1, dd2, dd3,
+ vreinterpret_u16_s16(vget_low_s16(d01)),
+ vreinterpret_u16_s16(vget_high_s16(d01)),
+ vreinterpret_u16_s16(vget_low_s16(d23)),
+ vreinterpret_u16_s16(vget_high_s16(d23)), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_dist_wtd_comp_avg, &d01_u8, &d23_u8);
+
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d01_u8), 0);
+ dst_u8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d01_u8), 1);
+ dst_u8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d23_u8), 0);
+ dst_u8_ptr += dst8_stride;
+ vst1_lane_u32((uint32_t *)dst_u8_ptr, vreinterpret_u32_u8(d23_u8), 1);
+ dst_u8_ptr += dst8_stride;
+ } else {
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d01), 0);
+ dst_ptr += dst_stride;
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d01), 1);
+ dst_ptr += dst_stride;
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d23), 0);
+ dst_ptr += dst_stride;
+ vst1q_lane_u64((uint64_t *)dst_ptr, vreinterpretq_u64_s16(d23), 1);
+ dst_ptr += dst_stride;
+ }
+
+ src_ptr += 4 * src_stride;
+ height -= 4;
+ } while (height > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst_u8_ptr;
+ width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ int16x8_t d0, d1, d2, d3;
+ uint16x8_t dd0, dd1, dd2, dd3;
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d1 = convolve8_8_sdot(s1, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d2 = convolve8_8_sdot(s2, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+ d3 = convolve8_8_sdot(s3, x_filter, correction, range_limit,
+ permute_tbl, shift_round_0);
+
+ d0 = vrshlq_s16(d0, horiz_const);
+ d1 = vrshlq_s16(d1, horiz_const);
+ d2 = vrshlq_s16(d2, horiz_const);
+ d3 = vrshlq_s16(d3, horiz_const);
+
+ d0 = vaddq_s16(d0, round_offset128);
+ d1 = vaddq_s16(d1, round_offset128);
+ d2 = vaddq_s16(d2, round_offset128);
+ d3 = vaddq_s16(d3, round_offset128);
+
+ if (conv_params->do_average) {
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ compute_avg_8x4(dd0, dd1, dd2, dd3, vreinterpretq_u16_s16(d0),
+ vreinterpretq_u16_s16(d1), vreinterpretq_u16_s16(d2),
+ vreinterpretq_u16_s16(d3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ } else {
+ store_u16_8x4(d, dst_stride, vreinterpretq_u16_s16(d0),
+ vreinterpretq_u16_s16(d1), vreinterpretq_u16_s16(d2),
+ vreinterpretq_u16_s16(d3));
+ }
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst_u8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height > 0);
+ }
+}
+
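
The range_limit/correction pair in this SDOT variant is the usual unsigned-to-signed bias trick: SDOT multiplies signed 8-bit values, so 128 is subtracted from every pixel and the constant 128 * sum(taps), computed by vaddlvq_s16(vshll_n_s8(x_filter, 7)), is added back so the dot product is unchanged. A scalar model of that identity, with arbitrary example inputs:

/* Scalar model of the SDOT bias trick: sum(f[k] * (s[k] - 128)) + 128 * sum(f)
 * equals sum(f[k] * s[k]).  Kernel and pixels below are arbitrary examples. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const int8_t f[8] = { -1, 3, -7, 37, 37, -7, 3, -1 };  /* halved even taps */
  const uint8_t s[8] = { 0, 255, 17, 128, 64, 200, 33, 90 };
  int correction = 0, biased = 0, plain = 0;
  for (int k = 0; k < 8; ++k) {
    correction += 128 * f[k];
    biased += f[k] * ((int)s[k] - 128);
    plain += f[k] * (int)s[k];
  }
  assert(biased + correction == plain);
  return 0;
}
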
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
const InterpFilterParams *filter_params_x,
@@ -892,18 +1703,14 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
// horizontal filter
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
const uint8_t *src_ptr = src - horiz_offset;
- int16_t x_filter_tmp[8];
- int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
- vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
const uint8_t *s;
uint8_t *d_u8;
@@ -980,20 +1787,20 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
s9 = vget_high_s16(u0);
s10 = vget_high_s16(u1);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- zero, shift_round_0);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+ shift_round_0);
d0 = vrshl_s16(d0, horiz_const);
d0 = vadd_s16(d0, round_offset_vec);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
- zero, shift_round_0);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, zero,
+ shift_round_0);
d1 = vrshl_s16(d1, horiz_const);
d1 = vadd_s16(d1, round_offset_vec);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
- zero, shift_round_0);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, zero,
+ shift_round_0);
d2 = vrshl_s16(d2, horiz_const);
d2 = vadd_s16(d2, round_offset_vec);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
- zero, shift_round_0);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, zero,
+ shift_round_0);
d3 = vrshl_s16(d3, horiz_const);
d3 = vadd_s16(d3, round_offset_vec);
@@ -1073,8 +1880,8 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- zero, shift_round_0);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+ shift_round_0);
d0 = vrshl_s16(d0, horiz_const);
d0 = vadd_s16(d0, round_offset_vec);
s0 = s4;
@@ -1173,38 +1980,38 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- zero, shift_round_0);
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+ shift_round_0);
res0 = vrshlq_s16(res0, horiz_const);
res0 = vaddq_s16(res0, round_offset128);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
- zero, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, zero,
+ shift_round_0);
res1 = vrshlq_s16(res1, horiz_const);
res1 = vaddq_s16(res1, round_offset128);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
- zero, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, zero,
+ shift_round_0);
res2 = vrshlq_s16(res2, horiz_const);
res2 = vaddq_s16(res2, round_offset128);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
zero, shift_round_0);
res3 = vrshlq_s16(res3, horiz_const);
res3 = vaddq_s16(res3, round_offset128);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
zero, shift_round_0);
res4 = vrshlq_s16(res4, horiz_const);
res4 = vaddq_s16(res4, round_offset128);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter_tmp, zero, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ zero, shift_round_0);
res5 = vrshlq_s16(res5, horiz_const);
res5 = vaddq_s16(res5, round_offset128);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter_tmp, zero, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ zero, shift_round_0);
res6 = vrshlq_s16(res6, horiz_const);
res6 = vaddq_s16(res6, round_offset128);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter_tmp, zero, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+ zero, shift_round_0);
res7 = vrshlq_s16(res7, horiz_const);
res7 = vaddq_s16(res7, round_offset128);
@@ -1293,8 +2100,8 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
- res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
- x_filter_tmp, zero, shift_round_0);
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ zero, shift_round_0);
res0 = vrshlq_s16(res0, horiz_const);
res0 = vaddq_s16(res0, round_offset128);
@@ -1328,6 +2135,8 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
}
}
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
const InterpFilterParams *filter_params_y,
@@ -1352,18 +2161,14 @@ void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
const int shift_value = (conv_params->round_1 - 1 - bits);
// vertical filter
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
const uint8_t *src_ptr = src - (vert_offset * src_stride);
- int16_t y_filter_tmp[8];
- int16x8_t filter_y_coef = vld1q_s16(y_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
- vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
const uint8_t *s;
uint8_t *d_u8;
@@ -1441,17 +2246,17 @@ void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
s9 = vget_low_s16(u1);
s10 = vget_high_s16(u1);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
d0 = vadd_s16(d0, round_offset64);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
- zero, shift_vec);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
+ shift_vec);
d1 = vadd_s16(d1, round_offset64);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
- zero, shift_vec);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
+ shift_vec);
d2 = vadd_s16(d2, round_offset64);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
- zero, shift_vec);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, zero,
+ shift_vec);
d3 = vadd_s16(d3, round_offset64);
if (conv_params->do_average) {
@@ -1504,8 +2309,8 @@ void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
s7 = vget_low_s16(u0);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
d0 = vadd_s16(d0, round_offset64);
@@ -1602,29 +2407,29 @@ void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
__builtin_prefetch(dst_ptr + 2 * dst_stride);
__builtin_prefetch(dst_ptr + 3 * dst_stride);
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
res0 = vaddq_s16(res0, round_offset128);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
- zero, shift_vec);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
+ shift_vec);
res1 = vaddq_s16(res1, round_offset128);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
- zero, shift_vec);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
+ shift_vec);
res2 = vaddq_s16(res2, round_offset128);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
zero, shift_vec);
res3 = vaddq_s16(res3, round_offset128);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp,
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
zero, shift_vec);
res4 = vaddq_s16(res4, round_offset128);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- y_filter_tmp, zero, shift_vec);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
+ zero, shift_vec);
res5 = vaddq_s16(res5, round_offset128);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- y_filter_tmp, zero, shift_vec);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
+ zero, shift_vec);
res6 = vaddq_s16(res6, round_offset128);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- y_filter_tmp, zero, shift_vec);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
+ zero, shift_vec);
res7 = vaddq_s16(res7, round_offset128);
if (conv_params->do_average) {
@@ -1682,8 +2487,8 @@ void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
__builtin_prefetch(dst_ptr);
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
res0 = vaddq_s16(res0, round_offset128);
s0 = s1;
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
index 190a3b289..df4f7d645 100644
--- a/av1/common/arm/resize_neon.c
+++ b/av1/common/arm/resize_neon.c
@@ -694,13 +694,42 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
} while (x);
}
+// There are SIMD optimizations for 1/4, 1/2 and 3/4 downscaling in NEON.
+static INLINE bool has_normative_scaler_neon(const int src_width,
+ const int src_height,
+ const int dst_width,
+ const int dst_height) {
+ const bool has_normative_scaler =
+ (2 * dst_width == src_width && 2 * dst_height == src_height) ||
+ (4 * dst_width == src_width && 4 * dst_height == src_height) ||
+ (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height);
+
+ return has_normative_scaler;
+}
+
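
A standalone restatement of the predicate, with hypothetical resolutions, showing which ratios take the NEON path and which fall back to the C scaler:

#include <assert.h>
#include <stdbool.h>

/* Same ratio checks as has_normative_scaler_neon above, restated so the
 * example compiles on its own. */
static bool is_normative_ratio(int sw, int sh, int dw, int dh) {
  return (2 * dw == sw && 2 * dh == sh) || (4 * dw == sw && 4 * dh == sh) ||
         (4 * dw == 3 * sw && 4 * dh == 3 * sh);
}

int main(void) {
  assert(is_normative_ratio(1920, 1080, 960, 540));    /* 1/2 */
  assert(is_normative_ratio(1920, 1080, 480, 270));    /* 1/4 */
  assert(is_normative_ratio(1280, 720, 960, 540));     /* 3/4 */
  assert(!is_normative_ratio(1920, 1080, 1280, 720));  /* 2/3: C fallback */
  return 0;
}
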
void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst,
const InterpFilter filter,
const int phase, const int num_planes) {
+ bool has_normative_scaler =
+ has_normative_scaler_neon(src->y_crop_width, src->y_crop_height,
+ dst->y_crop_width, dst->y_crop_height);
+
+ if (num_planes > 1) {
+ has_normative_scaler =
+ has_normative_scaler &&
+ has_normative_scaler_neon(src->uv_crop_width, src->uv_crop_height,
+ dst->uv_crop_width, dst->uv_crop_height);
+ }
+
+ if (!has_normative_scaler) {
+ av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+ return;
+ }
+
// We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
// the static analysis warnings.
- int scaled = 0;
+ int malloc_failed = 0;
for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
const int is_uv = i > 0;
const int src_w = src->crop_widths[is_uv];
@@ -711,7 +740,6 @@ void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
if (2 * dst_w == src_w && 2 * dst_h == src_h) {
- scaled = 1;
if (phase == 0) {
scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
dst->buffers[i], dst->strides[is_uv], dst_w,
@@ -727,21 +755,19 @@ void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
uint8_t *const temp_buffer =
(uint8_t *)malloc(buffer_stride * buffer_height);
- if (temp_buffer) {
- const InterpKernel *interp_kernel =
- (const InterpKernel *)av1_interp_filter_params_list[filter]
- .filter_ptr;
- scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv],
- dst_w, dst_h, interp_kernel[phase],
- temp_buffer);
- free(temp_buffer);
- } else {
- scaled = 0;
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
}
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
}
} else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
- scaled = 1;
if (phase == 0) {
scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
dst->buffers[i], dst->strides[is_uv], dst_w,
@@ -758,46 +784,45 @@ void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
uint8_t *const temp_buffer =
(uint8_t *)malloc(buffer_stride * buffer_height);
if (temp_buffer) {
- const InterpKernel *interp_kernel =
- (const InterpKernel *)av1_interp_filter_params_list[filter]
- .filter_ptr;
- scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv],
- dst_w, dst_h, interp_kernel[phase],
- temp_buffer);
- free(temp_buffer);
- } else {
- scaled = 0;
+ malloc_failed = 1;
+ break;
}
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
}
- } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ } else {
+ assert(4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h);
// 4 to 3
const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
uint8_t *const temp_buffer =
(uint8_t *)malloc(buffer_stride * buffer_height);
- if (temp_buffer) {
- scaled = 1;
- if (filter == BILINEAR) {
- scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv],
- dst_w, dst_h, phase, temp_buffer);
- } else {
- const InterpKernel *interp_kernel =
- (const InterpKernel *)av1_interp_filter_params_list[filter]
- .filter_ptr;
- scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv],
- dst_w, dst_h, interp_kernel, phase,
- temp_buffer);
- }
- free(temp_buffer);
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ if (filter == BILINEAR) {
+ scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, phase, temp_buffer);
} else {
- scaled = 0;
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel, phase, temp_buffer);
}
+ free(temp_buffer);
}
}
- if (!scaled) {
+
+ if (malloc_failed) {
av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
} else {
aom_extend_frame_borders(dst, num_planes);
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
index 06e7555f5..0a12c8837 100644
--- a/av1/common/arm/wiener_convolve_neon.c
+++ b/av1/common/arm/wiener_convolve_neon.c
@@ -22,25 +22,103 @@
#include "av1/common/common.h"
#include "av1/common/arm/convolve_neon.h"
+#define HORZ_FILTERING_CORE(t0, t1, t2, t3, t4, t5, t6, res) \
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t1)); \
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t3)); \
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t5)); \
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t6)); \
+ res = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, bd, \
+ conv_params->round_0);
+
+#define PROCESS_ROW_FOR_VERTICAL_FILTER \
+ __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); \
+ \
+ do { \
+ s7 = vld1q_s16(s); \
+ s += src_stride; \
+ \
+ t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, \
+ bd, conv_params->round_1); \
+ vst1_u8(d, t0); \
+ d += dst_stride; \
+ \
+ s0 = s1; \
+ s1 = s2; \
+ s2 = s3; \
+ s3 = s4; \
+ s4 = s5; \
+ s5 = s6; \
+ s6 = s7; \
+ height--; \
+ } while (height > 0);
+
+static INLINE void process_row_for_horz_filtering(
+ uint16_t *dst_ptr, int16_t *filter_x, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, ptrdiff_t dst_stride, int round0_bits, int w,
+ int height, int bd) {
+ do {
+ __builtin_prefetch(src_ptr);
+
+ uint8x8_t tt0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ __builtin_prefetch(dst_ptr);
+
+ const uint8_t *ss = src_ptr + 8;
+ uint16_t *d_tmp = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x8_t tt7 = vld1_u8(ss); // a8 a9 a10 a11 a12 a13 a14 a15
+ uint8x8_t ttemp_0 = tt0;
+ tt0 = tt7;
+
+ uint8x8_t tt1 = vext_u8(ttemp_0, tt7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ uint8x8_t tt2 = vext_u8(ttemp_0, tt7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ uint8x8_t tt3 = vext_u8(ttemp_0, tt7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ uint8x8_t tt4 = vext_u8(ttemp_0, tt7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ uint8x8_t tt5 = vext_u8(ttemp_0, tt7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ uint8x8_t tt6 = vext_u8(ttemp_0, tt7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ tt7 = vext_u8(ttemp_0, tt7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ int16x8_t ttt0 = vreinterpretq_s16_u16(vaddl_u8(ttemp_0, tt6));
+ int16x8_t ttt1 = vreinterpretq_s16_u16(vaddl_u8(tt1, tt5));
+ int16x8_t ttt2 = vreinterpretq_s16_u16(vaddl_u8(tt2, tt4));
+ int16x8_t ttt3 = vreinterpretq_s16_u16(vmovl_u8(tt3));
+ uint16x8_t dd0 = wiener_convolve8_horiz_8x8(ttt0, ttt1, ttt2, ttt3,
+ filter_x, bd, round0_bits);
+
+ vst1q_u16(d_tmp, dd0);
+
+ ss += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+}
+
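
Both HORZ_FILTERING_CORE and process_row_for_horz_filtering fold symmetric sample pairs with vaddl_u8 before multiplying, exploiting the symmetry of the 7-tap Wiener kernel (tap k equals tap 6 - k) so each output needs only four multiplies. A scalar sketch of that folding, with an arbitrary symmetric kernel rather than a real Wiener fit:

/* Folded vs. direct evaluation of a symmetric 7-tap filter (f[k] == f[6-k]). */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const int16_t f[7] = { 3, -11, 24, 96, 24, -11, 3 };  /* arbitrary symmetric */
  const uint8_t s[7] = { 10, 200, 55, 128, 9, 77, 250 };
  int direct = 0;
  for (int k = 0; k < 7; ++k) direct += f[k] * s[k];
  const int folded = f[0] * (s[0] + s[6]) + f[1] * (s[1] + s[5]) +
                     f[2] * (s[2] + s[4]) + f[3] * s[3];
  assert(folded == direct);
  return 0;
}
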
/* Wiener filter 2D
Apply horizontal filter and store in a temporary buffer. When applying
vertical filter, overwrite the original pixel values.
- */
+*/
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
const ConvolveParams *conv_params) {
- uint16_t *d_tmp;
uint8_t *d;
const uint8_t *src_ptr, *s_tmp;
uint16_t *dst_ptr;
(void)x_step_q4;
(void)y_step_q4;
- int width, height;
+ int height;
const int bd = 8;
+ // Indicates the height that needs to be processed during horizontal filtering.
const int intermediate_height = h + SUBPEL_TAPS - 1;
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
int16_t filter_x_tmp[7], filter_y_tmp[7];
@@ -74,15 +152,15 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
src_ptr = s_tmp;
height = intermediate_height;
- /* if height is a multiple of 8 */
- if (!(h & 7)) {
- int16x8_t res0, res1, res2, res3;
- uint16x8_t res4;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ // AArch64 path.
#if defined(__aarch64__)
- uint16x8_t res5, res6, res7, res8, res9, res10, res11;
- uint8x8_t t8, t9, t10, t11, t12, t13, t14;
-
+ int processed_height = 0;
+ uint16_t *d_tmp;
+ int width, remaining_height;
+ // Start of horizontal filtering.
+ if (intermediate_height > 7) {
+ uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
do {
const uint8_t *s;
@@ -112,64 +190,19 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(dst_ptr + 7 * dst_stride);
do {
+ int16x8_t res0, res1, res2, res3;
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14;
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
- res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
- res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
- res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
- res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
- res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
- res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
- res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
+ HORZ_FILTERING_CORE(t0, t6, t1, t5, t2, t4, t3, res4)
+ HORZ_FILTERING_CORE(t1, t7, t2, t6, t3, t5, t4, res5)
+ HORZ_FILTERING_CORE(t2, t8, t3, t7, t4, t6, t5, res6)
+ HORZ_FILTERING_CORE(t3, t9, t4, t8, t5, t7, t6, res7)
+ HORZ_FILTERING_CORE(t4, t10, t5, t9, t6, t8, t7, res8)
+ HORZ_FILTERING_CORE(t5, t11, t6, t10, t7, t9, t8, res9)
+ HORZ_FILTERING_CORE(t6, t12, t7, t11, t8, t10, t9, res10)
+ HORZ_FILTERING_CORE(t7, t13, t8, t12, t9, t11, t10, res11)
transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
&res11);
@@ -190,212 +223,19 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
src_ptr += 8 * src_stride;
dst_ptr += 8 * MAX_SB_SIZE;
height -= 8;
- } while (height > 0);
-#else
- uint8x8_t temp_0;
-
- do {
- const uint8_t *s;
-
- __builtin_prefetch(src_ptr);
-
- t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
- s = src_ptr + 8;
- d_tmp = dst_ptr;
- width = w;
-
- __builtin_prefetch(dst_ptr);
-
- do {
- t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- temp_0 = t0;
- t0 = t7;
-
- t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
- res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
- res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
- res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
- bd, conv_params->round_0);
-
- vst1q_u16(d_tmp, res4);
-
- s += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += src_stride;
- dst_ptr += MAX_SB_SIZE;
- height--;
- } while (height > 0);
-#endif
- } else {
- /*if height is a multiple of 4*/
- const uint8_t *s;
- int16x8_t tt0, tt1, tt2, tt3;
- uint16x8_t d0;
- uint8x8_t t0, t1, t2, t3;
-
-#if defined(__aarch64__)
- uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t d1, d2, d3;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x4_t s11, s12, s13, s14;
- do {
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
-
- load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
- transpose_u8_8x4(&t0, &t1, &t2,
- &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/
-
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- s0 = vget_low_s16(tt0); /*pa0 pb0 pc0 pd0 -- pixel_a0*/
- s1 = vget_low_s16(tt1); /*pa1 pb1 pc1 pd1 */
- s2 = vget_low_s16(tt2); /*pa2 pb2 pc2 pd2 */
- s3 = vget_low_s16(tt3); /*pa3 pb3 pc3 pd3 */
- s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */
- s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */
- s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
- s = src_ptr + 7;
- d_tmp = dst_ptr;
- width = w;
-
- do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- s7 = vget_low_s16(tt0); /*pa7 pb7 pc7 pd7 */ /*4x8*/
- s8 = vget_low_s16(tt1); /*pa8 pb8 pc8 pd8 */
- s9 = vget_low_s16(tt2); /*pa9 pb9 pc9 pd9 */
- s10 = vget_low_s16(tt3); /*pa10 pb10 pc10 pd10 */
- s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */
- s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */
- s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */
- s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */
-
- res0 = wiener_convolve8_horiz_4x8(
- s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
- res1 = wiener_convolve8_horiz_4x8(
- s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
- res2 = wiener_convolve8_horiz_4x8(
- s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
- res3 = wiener_convolve8_horiz_4x8(
- s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
- res4 =
- wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10,
- filter_x_tmp, bd, conv_params->round_0);
- res5 =
- wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11,
- filter_x_tmp, bd, conv_params->round_0);
- res6 =
- wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12,
- filter_x_tmp, bd, conv_params->round_0);
- res7 =
- wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13,
- filter_x_tmp, bd, conv_params->round_0);
-
- transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
- &res7, &d0, &d1, &d2, &d3);
-
- store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * MAX_SB_SIZE;
- height -= 4;
- } while (height > 0);
-#else
- uint8x8_t temp_0, t4, t5, t6, t7;
-
- do {
- __builtin_prefetch(src_ptr);
-
- t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
-
- __builtin_prefetch(dst_ptr);
-
- s = src_ptr + 8;
- d_tmp = dst_ptr;
- width = w;
-
- do {
- t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- temp_0 = t0;
- t0 = t7;
-
- t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
- tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
- tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
- tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
- conv_params->round_0);
-
- vst1q_u16(d_tmp, d0);
-
- s += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += MAX_SB_SIZE;
- height -= 1;
- } while (height > 0);
-#endif
+ processed_height += 8;
+ } while (height > 7);
}
+ // Process the remaining rows for horizontal filtering.
+ remaining_height = intermediate_height - processed_height;
+ if (remaining_height)
+ process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
+ MAX_SB_SIZE, conv_params->round_0, w, height,
+ bd);
+
+ // Start of vertical filtering.
{
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint8x8_t t0;
-#if defined(__aarch64__)
- int16x8_t s8, s9, s10;
- uint8x8_t t1, t2, t3;
-#endif
int16_t *src_tmp_ptr, *s;
uint8_t *dst_tmp_ptr;
height = h;
@@ -405,6 +245,8 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
src_stride = MAX_SB_SIZE;
do {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t t0;
s = src_tmp_ptr;
s0 = vld1q_s16(s);
s += src_stride;
@@ -423,8 +265,9 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
d = dst_tmp_ptr;
height = h;
-#if defined(__aarch64__)
do {
+ int16x8_t s8, s9, s10;
+ uint8x8_t t1, t2, t3;
__builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
__builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
__builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
@@ -467,64 +310,55 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
height -= 4;
} while (height > 3);
- if (height != 0) {
- __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
-
- do {
- s7 = vld1q_s16(s);
- s += src_stride;
-
- t0 =
- wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
- filter_y_tmp, bd, conv_params->round_1);
- vst1_u8(d, t0);
- d += dst_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- height -= 1;
- } while (height > 0);
+ if (height) {
+ PROCESS_ROW_FOR_VERTICAL_FILTER
}
-
src_tmp_ptr += 8;
dst_tmp_ptr += 8;
-
w -= 8;
} while (w > 0);
+ }
#else
- do {
- __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
-
- s7 = vld1q_s16(s);
- s += src_stride;
-
- t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
- bd, conv_params->round_1);
+ // Start of horizontal filtering.
+ process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
+ MAX_SB_SIZE, conv_params->round_0, w, height,
+ bd);
- vst1_u8(d, t0);
- d += dst_stride;
+ // Start of vertical filtering.
+ {
+ int16_t *src_tmp_ptr, *s;
+ uint8_t *dst_tmp_ptr;
+ src_tmp_ptr = (int16_t *)temp;
+ dst_tmp_ptr = dst;
+ src_stride = MAX_SB_SIZE;
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- height -= 1;
- } while (height > 0);
+ do {
+ uint8x8_t t0;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ s = src_tmp_ptr;
+ s0 = vld1q_s16(s);
+ s += src_stride;
+ s1 = vld1q_s16(s);
+ s += src_stride;
+ s2 = vld1q_s16(s);
+ s += src_stride;
+ s3 = vld1q_s16(s);
+ s += src_stride;
+ s4 = vld1q_s16(s);
+ s += src_stride;
+ s5 = vld1q_s16(s);
+ s += src_stride;
+ s6 = vld1q_s16(s);
+ s += src_stride;
+ d = dst_tmp_ptr;
+ height = h;
+ PROCESS_ROW_FOR_VERTICAL_FILTER
src_tmp_ptr += 8;
dst_tmp_ptr += 8;
w -= 8;
} while (w > 0);
-#endif
}
+#endif
}
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index fd2ec069f..f3f2823ef 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -301,7 +301,7 @@ typedef struct SequenceHeader {
aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
// AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers.
- uint8_t monochrome; // Monochorme video
+ uint8_t monochrome; // Monochrome video
aom_color_primaries_t color_primaries;
aom_transfer_characteristics_t transfer_characteristics;
aom_matrix_coefficients_t matrix_coefficients;
@@ -461,11 +461,11 @@ typedef struct CommonTileParams {
*/
int min_log2_rows;
/*!
- * Min num of tile columns possible based on frame width.
+ * Max num of tile columns possible based on frame width.
*/
int max_log2_cols;
/*!
- * Max num of tile columns possible based on frame width.
+ * Max num of tile rows possible based on frame height.
*/
int max_log2_rows;
/*!
@@ -660,7 +660,7 @@ struct CommonQuantParams {
*/
/**@{*/
/*!
- * Global dquantization matrix table.
+ * Global dequantization matrix table.
*/
const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
/*!
@@ -760,7 +760,7 @@ typedef struct AV1Common {
/*!
* AV1 allows two types of frame scaling operations:
* 1. Frame super-resolution: that allows coding a frame at lower resolution
- * and after decoding the frame, normatively uscales and restores the frame --
+ * and after decoding the frame, normatively scales and restores the frame --
* inside the coding loop.
* 2. Frame resize: that allows coding frame at lower/higher resolution, and
* then non-normatively upscale the frame at the time of rendering -- outside
@@ -1374,7 +1374,7 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
xd->is_chroma_ref = chroma_ref;
if (chroma_ref) {
// To help calculate the "above" and "left" chroma blocks, note that the
- // current block may cover multiple luma blocks (eg, if partitioned into
+ // current block may cover multiple luma blocks (e.g., if partitioned into
// 4x4 luma blocks).
// First, find the top-left-most luma block covered by this chroma block
MB_MODE_INFO **base_mi =
@@ -1871,9 +1871,7 @@ static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
// The following levels are currently undefined.
seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
- seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 &&
- seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
- seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3);
+ seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3);
}
/*!\endcond */
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index 5dede53d3..5af025c65 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -1350,13 +1350,11 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
}
}
-void av1_filter_block_plane_vert_opt(const AV1_COMMON *const cm,
- const MACROBLOCKD *const xd,
- const MACROBLOCKD_PLANE *const plane_ptr,
- const uint32_t mi_row,
- const uint32_t mi_col,
- AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf) {
+void av1_filter_block_plane_vert_opt(
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) {
uint8_t *const dst_ptr = plane_ptr->dst.buf;
const int dst_stride = plane_ptr->dst.stride;
// Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
@@ -1365,7 +1363,14 @@ void av1_filter_block_plane_vert_opt(const AV1_COMMON *const cm,
CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2);
const int plane_mi_rows =
CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2);
- const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), MAX_MIB_SIZE);
+ // Whenever 'pipeline_lpf_mt_with_enc' is enabled, the height of the unit
+ // to filter (i.e., y_range) is calculated based on the size of the
+ // superblock used.
+ const int y_range = AOMMIN((int)(plane_mi_rows - mi_row),
+ (1 << num_mis_in_lpf_unit_height_log2));
+ // The width of the unit to filter (i.e., x_range) should always be
+ // calculated based on the maximum superblock size, as this function is
+ // called for mi_col = 0, MAX_MIB_SIZE, 2 * MAX_MIB_SIZE, etc.
const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE);
const ptrdiff_t mode_step = 1;
for (int y = 0; y < y_range; y++) {
@@ -1417,7 +1422,8 @@ void av1_filter_block_plane_vert_opt_chroma(
const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf, int plane, bool joint_filter_chroma) {
+ TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+ int num_mis_in_lpf_unit_height_log2) {
const uint32_t scale_horz = plane_ptr->subsampling_x;
const uint32_t scale_vert = plane_ptr->subsampling_y;
const int dst_stride = plane_ptr->dst.stride;
@@ -1429,8 +1435,9 @@ void av1_filter_block_plane_vert_opt_chroma(
((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2;
const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert);
const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz);
- const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
- (MAX_MIB_SIZE >> scale_vert));
+ const int y_range =
+ AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+ ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert));
const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
(MAX_MIB_SIZE >> scale_horz));
const ptrdiff_t mode_step = (ptrdiff_t)1 << scale_horz;
@@ -1943,13 +1950,11 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
}
}
-void av1_filter_block_plane_horz_opt(const AV1_COMMON *const cm,
- const MACROBLOCKD *const xd,
- const MACROBLOCKD_PLANE *const plane_ptr,
- const uint32_t mi_row,
- const uint32_t mi_col,
- AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf) {
+void av1_filter_block_plane_horz_opt(
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) {
uint8_t *const dst_ptr = plane_ptr->dst.buf;
const int dst_stride = plane_ptr->dst.stride;
// Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
@@ -1958,7 +1963,8 @@ void av1_filter_block_plane_horz_opt(const AV1_COMMON *const cm,
CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2);
const int plane_mi_rows =
CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2);
- const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), MAX_MIB_SIZE);
+ const int y_range = AOMMIN((int)(plane_mi_rows - mi_row),
+ (1 << num_mis_in_lpf_unit_height_log2));
const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE);
const ptrdiff_t mode_step = cm->mi_params.mi_stride;
@@ -2011,7 +2017,8 @@ void av1_filter_block_plane_horz_opt_chroma(
const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf, int plane, bool joint_filter_chroma) {
+ TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+ int num_mis_in_lpf_unit_height_log2) {
const uint32_t scale_horz = plane_ptr->subsampling_x;
const uint32_t scale_vert = plane_ptr->subsampling_y;
const int dst_stride = plane_ptr->dst.stride;
@@ -2023,8 +2030,9 @@ void av1_filter_block_plane_horz_opt_chroma(
((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2;
const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert);
const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz);
- const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
- (MAX_MIB_SIZE >> scale_vert));
+ const int y_range =
+ AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+ ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert));
const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
(MAX_MIB_SIZE >> scale_horz));
const ptrdiff_t mode_step = cm->mi_params.mi_stride << scale_vert;
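For reference, the hunks above change the loop-filter unit height from a fixed MAX_MIB_SIZE to a value derived from the superblock size, passed in as num_mis_in_lpf_unit_height_log2. A minimal standalone sketch of the clamping arithmetic follows; it is not part of the patch and the constants in the comments are illustrative only.

#include <stdio.h>

#define AOMMIN(a, b) (((a) < (b)) ? (a) : (b))

/* y_range: number of 4x4 mode-info rows filtered in one unit, clamped at the
 * bottom edge of the plane. unit_height_log2 would be 4 for a 64x64
 * superblock (16 mi units) and 5 for a 128x128 superblock (32 mi units). */
static int lpf_unit_height(int plane_mi_rows, int mi_row,
                           int unit_height_log2) {
  return AOMMIN(plane_mi_rows - mi_row, 1 << unit_height_log2);
}

int main(void) {
  /* A 270-pixel-tall plane has 68 mi rows; the last 64x64 unit is clipped. */
  printf("%d\n", lpf_unit_height(68, 64, 4)); /* prints 4 */
  return 0;
}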
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 43bd780eb..78443c798 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -110,33 +110,31 @@ void av1_filter_block_plane_horz(const struct AV1Common *const cm,
const MACROBLOCKD_PLANE *const plane_ptr,
const uint32_t mi_row, const uint32_t mi_col);
-void av1_filter_block_plane_vert_opt(const struct AV1Common *const cm,
- const MACROBLOCKD *const xd,
- const MACROBLOCKD_PLANE *const plane_ptr,
- const uint32_t mi_row,
- const uint32_t mi_col,
- AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf);
+void av1_filter_block_plane_vert_opt(
+ const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2);
void av1_filter_block_plane_vert_opt_chroma(
const struct AV1Common *const cm, const MACROBLOCKD *const xd,
const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf, int plane, bool joint_filter_chroma);
+ TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+ int num_mis_in_lpf_unit_height_log2);
-void av1_filter_block_plane_horz_opt(const struct AV1Common *const cm,
- const MACROBLOCKD *const xd,
- const MACROBLOCKD_PLANE *const plane_ptr,
- const uint32_t mi_row,
- const uint32_t mi_col,
- AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf);
+void av1_filter_block_plane_horz_opt(
+ const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2);
void av1_filter_block_plane_horz_opt_chroma(
const struct AV1Common *const cm, const MACROBLOCKD *const xd,
const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
- TX_SIZE *tx_buf, int plane, bool joint_filter_chroma);
+ TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+ int num_mis_in_lpf_unit_height_log2);
uint8_t av1_get_filter_level(const struct AV1Common *cm,
const loop_filter_info_n *lfi_n, const int dir_idx,
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index c9e87e3db..ba1dcbb7c 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -403,7 +403,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
- specialize qw/av1_apply_temporal_filter sse2 avx2/;
+ specialize qw/av1_apply_temporal_filter sse2 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
@@ -412,12 +412,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
- add_proto qw/void av1_calc_indices_dim1/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
+ add_proto qw/void av1_calc_indices_dim1/, "const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k";
specialize qw/av1_calc_indices_dim1 sse2 avx2/;
- # TODO(any): Disable av1_calc_indices_dim2 sse2 version due to c/SIMD mismatch. Re-enable it after mismatch is fixed.
- add_proto qw/void av1_calc_indices_dim2/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
- specialize qw/av1_calc_indices_dim2 avx2/;
+ add_proto qw/void av1_calc_indices_dim2/, "const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k";
+ specialize qw/av1_calc_indices_dim2 sse2 avx2/;
# ENCODEMB INVOKE
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -442,7 +441,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_txb_init_levels sse4_1 avx2 neon/;
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
- specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
+ specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon/;
add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
@@ -450,7 +449,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# hash
add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
- specialize qw/av1_get_crc32c_value sse4_2/;
+ specialize qw/av1_get_crc32c_value sse4_2 arm_crc32/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats";
@@ -519,8 +518,8 @@ add_proto qw/void cdef_filter_16_1/, "void *dst16, int dstride, const uint16_t *
add_proto qw/void cdef_filter_16_2/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
add_proto qw/void cdef_filter_16_3/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
-add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
-add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
+add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height";
+add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
@@ -555,11 +554,6 @@ specialize qw/av1_warp_affine sse4_1 avx2 neon/;
add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
specialize qw/av1_calc_frame_error sse2 avx2/;
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
- specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
-}
-
# LOOP_RESTORATION functions
add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index b2e72d2e4..5f90e5707 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -188,6 +188,7 @@ static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = {
typedef struct RD_STATS {
int rate;
+ int zero_rate;
int64_t dist;
// Please be careful of using rdcost, it's not guaranteed to be set all the
// time.
@@ -196,8 +197,7 @@ typedef struct RD_STATS {
// rate/dist.
int64_t rdcost;
int64_t sse;
- int skip_txfm; // sse should equal to dist when skip_txfm == 1
- int zero_rate;
+ uint8_t skip_txfm; // sse should equal to dist when skip_txfm == 1
#if CONFIG_RD_DEBUG
int txb_coeff_cost[MAX_MB_PLANE];
#endif // CONFIG_RD_DEBUG
@@ -285,7 +285,7 @@ typedef struct MB_MODE_INFO {
****************************************************************************/
/**@{*/
/*! \brief Whether to skip transforming and sending. */
- int8_t skip_txfm;
+ uint8_t skip_txfm;
/*! \brief Transform size when fixed size txfm is used (e.g. intra modes). */
TX_SIZE tx_size;
/*! \brief Transform size when recursive txfm tree is on. */
@@ -326,9 +326,6 @@ typedef struct MB_MODE_INFO {
int8_t cdef_strength : 4;
/**@}*/
- /*! \brief Skip CDEF for this superblock */
- uint8_t skip_cdef_curr_sb;
-
#if CONFIG_RD_DEBUG
/*! \brief RD info used for debugging */
RD_STATS rd_stats;
@@ -1196,7 +1193,7 @@ static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
assert(bsize < BLOCK_SIZES_ALL);
assert(subsampling_x >= 0 && subsampling_x < 2);
assert(subsampling_y >= 0 && subsampling_y < 2);
- return ss_size_lookup[bsize][subsampling_x][subsampling_y];
+ return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
}
/*
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 7807bb739..202f9d6da 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -68,44 +68,51 @@ int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
}
void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride,
- const uint8_t *src, int sstride, int v,
- int h) {
- for (int i = 0; i < v; i++) {
- for (int j = 0; j < h; j++) {
+ const uint8_t *src, int sstride, int width,
+ int height) {
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
dst[i * dstride + j] = src[i * sstride + j];
}
}
}
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
- const uint16_t *src, int sstride, int v,
- int h) {
- for (int i = 0; i < v; i++) {
- for (int j = 0; j < h; j++) {
+ const uint16_t *src, int sstride,
+ int width, int height) {
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
dst[i * dstride + j] = src[i * sstride + j];
}
}
}
+void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride,
+ const uint8_t *src, int src_voffset,
+ int src_hoffset, int sstride, int vsize,
+ int hsize) {
+ const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
+ cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
+}
+
+void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride,
+ const uint8_t *src, int src_voffset,
+ int src_hoffset, int sstride, int vsize,
+ int hsize) {
+ const uint16_t *base =
+ &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
+ cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
+}
+
void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
int dstride, const uint8_t *src, int src_voffset,
int src_hoffset, int sstride, int vsize, int hsize) {
if (cm->seq_params->use_highbitdepth) {
- const uint16_t *base =
- &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
- cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+ av1_cdef_copy_sb8_16_highbd(dst, dstride, src, src_voffset, src_hoffset,
+ sstride, vsize, hsize);
} else {
- const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
- cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
- }
-}
-
-static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
- uint16_t x) {
- for (int i = 0; i < v; i++) {
- for (int j = 0; j < h; j++) {
- dst[i * dstride + j] = x;
- }
+ av1_cdef_copy_sb8_16_lowbd(dst, dstride, src, src_voffset, src_hoffset,
+ sstride, vsize, hsize);
}
}
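The cdef_copy_rect8_* changes above rename the ambiguous 'v'/'h' parameters to 'height'/'width'; the new av1_cdef_copy_sb8_16_* wrappers accordingly pass (hsize, vsize) where the old call sites passed (vsize, hsize), so behavior is unchanged. A standalone sketch of the copy with the new parameter naming (illustrative only, not library code):

#include <stdint.h>
#include <stdio.h>

/* Widen an 8-bit rectangle into a 16-bit buffer; 'width' indexes columns and
 * 'height' indexes rows, matching the renamed parameters. */
static void copy_rect8_to_16(uint16_t *dst, int dstride, const uint8_t *src,
                             int sstride, int width, int height) {
  for (int i = 0; i < height; i++)
    for (int j = 0; j < width; j++) dst[i * dstride + j] = src[i * sstride + j];
}

int main(void) {
  uint8_t src[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };
  uint16_t dst[2][4] = { { 0 } };
  copy_rect8_to_16(&dst[0][0], 4, &src[0][0], 4, 4, 2); /* 4 wide, 2 tall */
  printf("%u %u\n", dst[0][0], dst[1][3]); /* prints: 1 8 */
  return 0;
}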
diff --git a/av1/common/cdef.h b/av1/common/cdef.h
index 5bf40e471..e166f4b20 100644
--- a/av1/common/cdef.h
+++ b/av1/common/cdef.h
@@ -89,7 +89,7 @@ typedef void (*cdef_init_fb_row_t)(
* \param[in] xd Pointer to common current coding block structure
* \param[in] cdef_init_fb_row_fn Function Pointer
*
- * \return Nothing is returned. Instead, the filtered frame is output in
+ * \remark Nothing is returned. Instead, the filtered frame is output in
* \c frame.
*/
void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
diff --git a/av1/common/cdef_block.c b/av1/common/cdef_block.c
index 073167807..ce7039f37 100644
--- a/av1/common/cdef_block.c
+++ b/av1/common/cdef_block.c
@@ -292,7 +292,7 @@ static INLINE int adjust_strength(int strength, int32_t var) {
return var ? (strength * (4 + i) + 8) >> 4 : 0;
}
-static AOM_INLINE void aom_cdef_find_dir(uint16_t *in, cdef_list *dlist,
+static AOM_INLINE void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist,
int var[CDEF_NBLOCKS][CDEF_NBLOCKS],
int cdef_count, int coeff_shift,
int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) {
@@ -321,7 +321,7 @@ static AOM_INLINE void aom_cdef_find_dir(uint16_t *in, cdef_list *dlist,
}
void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
- uint16_t *in, int xdec, int ydec,
+ const uint16_t *in, int xdec, int ydec,
int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
cdef_list *dlist, int cdef_count, int level,
diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h
index 679f1ef2a..455a896f4 100644
--- a/av1/common/cdef_block.h
+++ b/av1/common/cdef_block.h
@@ -51,9 +51,18 @@ void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
cdef_list *dlist, int cdef_count, int bsize);
void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
- uint16_t *in, int xdec, int ydec,
+ const uint16_t *in, int xdec, int ydec,
int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int damping, int coeff_shift);
+
+static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
+ uint16_t x) {
+ for (int i = 0; i < v; i++) {
+ for (int j = 0; j < h; j++) {
+ dst[i * dstride + j] = x;
+ }
+ }
+}
#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_
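fill_rect moves from cdef.c into this header so other translation units can pad CDEF working buffers. A small standalone usage sketch; the sentinel value 0x4000 below is illustrative only and stands in for whatever border marker the caller needs:

#include <stdint.h>
#include <stdio.h>

static void fill_rect16(uint16_t *dst, int dstride, int v, int h, uint16_t x) {
  for (int i = 0; i < v; i++)
    for (int j = 0; j < h; j++) dst[i * dstride + j] = x;
}

int main(void) {
  /* Pre-fill a small working buffer with a sentinel before copying pixels in,
   * the way a CDEF-style border can be marked as outside the frame. */
  uint16_t buf[4 * 8];
  fill_rect16(buf, 8, 4, 8, 0x4000);
  printf("0x%x\n", buf[3 * 8 + 7]); /* prints: 0x4000 */
  return 0;
}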
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index 73119e2fa..df6787177 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -197,8 +197,16 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
return best_dir;
}
+// Work around compiler out of memory issues with Win32 builds. This issue has
+// been observed with Visual Studio 2017, 2019, and 2022 (version 17.4).
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1940
+#define CDEF_INLINE static INLINE
+#else
+#define CDEF_INLINE SIMD_INLINE
+#endif
+
// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
-SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
+CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
unsigned int adjdamp) {
v256 diff = v256_sub_16(a, b);
const v256 sign = v256_shr_n_s16(diff, 15);
@@ -262,7 +270,7 @@ SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max,
return max;
}
-SIMD_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
+CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir, int pri_damping,
int sec_damping, int coeff_shift, int height,
@@ -454,7 +462,7 @@ SIMD_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
}
}
-SIMD_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride,
+CDEF_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir, int pri_damping,
int sec_damping, int coeff_shift, int height,
@@ -816,17 +824,19 @@ void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in,
void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
const uint16_t *src, int sstride,
- int v, int h) {
+ int width, int height) {
int i, j;
- for (i = 0; i < v; i++) {
- for (j = 0; j < (h & ~0x7); j += 8) {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
v128 row = v128_load_unaligned(&src[i * sstride + j]);
v128_store_unaligned(&dst[i * dstride + j], row);
}
- for (; j < h; j++) {
+ for (; j < width; j++) {
dst[i * dstride + j] = src[i * sstride + j];
}
}
}
+#undef CDEF_INLINE
+
#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/av1/common/common_data.c b/av1/common/common_data.c
new file mode 100644
index 000000000..482aecfcc
--- /dev/null
+++ b/av1/common/common_data.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common_data.h"
+
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
+/* clang-format off */
+const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
+ // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+ // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+ { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
+ { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+ { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
+ { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
+ { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+ { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
+ { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
+ { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+ { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+ { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
+ { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
+ { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
+ { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
+ { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+};
+/* clang-format on */
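The table above is indexed as av1_ss_size_lookup[bsize][ss_x][ss_y], which is what get_plane_block_size() in blockd.h now resolves to. A standalone sketch of that lookup pattern with a single-entry stand-in table; the enum names and values below are illustrative, not the real BLOCK_SIZE enum:

#include <stdio.h>

/* Stand-ins for a few block sizes, for illustration only. */
enum { SB_8X8, SB_8X4, SB_4X8, SB_4X4, SB_INVALID };

/* subsampled_size[bsize][ss_x][ss_y]: chroma plane size for a luma block. */
static const int subsampled_size[1][2][2] = {
  /* 8x8 luma: */ { { SB_8X8, SB_8X4 }, { SB_4X8, SB_4X4 } },
};

int main(void) {
  /* 4:2:0 video has ss_x = ss_y = 1, so an 8x8 luma block maps to 4x4. */
  printf("%d\n", subsampled_size[0][1][1] == SB_4X4); /* prints: 1 */
  return 0;
}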
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index 6ab7af417..dfe927c6e 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -365,7 +365,6 @@ static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = {
5, // TX_64X16
};
-/* clang-format off */
static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
TX_4X4, // ONLY_4X4
TX_64X64, // TX_MODE_LARGEST
@@ -374,33 +373,7 @@ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
// size function).
-static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
- // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
- // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
- { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
- { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
- { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
- { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
- { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
- { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
- { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
- { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
- { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
- { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
- { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
- { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
- { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
- { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
- { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
- { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
- { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
- { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
- { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
- { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
-};
-/* clang-format on */
+extern const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2];
// Generates 5 bit field in which each bit set to 1 represents
// a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 63dda39da..54b2bb040 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -73,45 +73,6 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
}
}
-void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
- int dst_stride, int w, int h, int dir,
- double norm) {
- int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
- DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
- DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
- const int taps = 3;
- int im_h = h + taps - 1;
- int im_stride = w;
- const int fo_vert = 1;
- const int fo_horiz = 1;
-
- // horizontal filter
- const uint8_t *src_horiz = src - fo_vert * src_stride;
- const int16_t *x_filter = dir ? sobel_a : sobel_b;
- for (int y = 0; y < im_h; ++y) {
- for (int x = 0; x < w; ++x) {
- int16_t sum = 0;
- for (int k = 0; k < taps; ++k) {
- sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
- }
- im_block[y * im_stride + x] = sum;
- }
- }
-
- // vertical filter
- int16_t *src_vert = im_block + fo_vert * im_stride;
- const int16_t *y_filter = dir ? sobel_b : sobel_a;
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- int16_t sum = 0;
- for (int k = 0; k < taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- dst[y * dst_stride + x] = sum * norm;
- }
- }
-}
-
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 5f3e59625..36c0c842b 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -72,12 +72,16 @@ static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane,
conv_params.round_0 = ROUND0_BITS;
conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
: 2 * FILTER_BITS - conv_params.round_0;
+#if CONFIG_AV1_HIGHBITDEPTH
const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
assert(IMPLIES(bd < 12, intbufrange <= 16));
if (intbufrange > 16) {
conv_params.round_0 += intbufrange - 16;
if (!is_compound) conv_params.round_1 -= intbufrange - 16;
}
+#else
+ (void)bd;
+#endif // CONFIG_AV1_HIGHBITDEPTH
// TODO(yunqing): The following dst should only be valid while
// is_compound = 1;
conv_params.dst = dst;
@@ -122,11 +126,6 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
int scaled, ConvolveParams *conv_params,
int bd);
-// TODO(sarahparker) This will need to be integerized and optimized
-void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
- int dst_stride, int w, int h, int dir,
- double norm);
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 7582d54d4..49fc551fc 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -885,13 +885,11 @@ static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
{ AOM_CDF3(16803, 22759) } },
};
-#define MAX_COLOR_CONTEXT_HASH 8
// Negative values are invalid
-static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH +
- 1] = { -1, -1, 0, -1, -1,
- 4, 3, 2, 1 };
+const int av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1] = {
+ -1, -1, 0, -1, -1, 4, 3, 2, 1
+};
-#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top.
int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
int r, int c, int palette_size,
uint8_t *color_order, int *color_idx) {
@@ -963,120 +961,11 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
// Lookup context from hash.
const int color_index_ctx =
- palette_color_index_context_lookup[color_index_ctx_hash];
- assert(color_index_ctx >= 0);
- assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
- return color_index_ctx;
-}
-
-int av1_fast_palette_color_index_context(const uint8_t *color_map, int stride,
- int r, int c, int *color_idx) {
- assert(r > 0 || c > 0);
-
- // This goes in the order of left, top, and top-left. This has the advantage
- // that unless anything here are not distinct or invalid, this will already
- // be in sorted order. Furthermore, if either of the first two are not
- // invalid, we know the last one is also invalid.
- int color_neighbors[NUM_PALETTE_NEIGHBORS];
- color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
- color_neighbors[1] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
- color_neighbors[2] =
- (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
-
- // Since our array is so small, using a couple if statements is faster
- int scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 };
- if (color_neighbors[0] == color_neighbors[1]) {
- scores[0] += scores[1];
- color_neighbors[1] = -1;
-
- if (color_neighbors[0] == color_neighbors[2]) {
- scores[0] += scores[2];
- color_neighbors[2] = -1;
- }
- } else if (color_neighbors[0] == color_neighbors[2]) {
- scores[0] += scores[2];
- color_neighbors[2] = -1;
- } else if (color_neighbors[1] == color_neighbors[2]) {
- scores[1] += scores[2];
- color_neighbors[2] = -1;
- }
-
- int color_rank[NUM_PALETTE_NEIGHBORS] = { -1, -1, -1 };
- int score_rank[NUM_PALETTE_NEIGHBORS] = { 0, 0, 0 };
- int num_valid_colors = 0;
- for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; idx++) {
- if (color_neighbors[idx] != -1) {
- score_rank[num_valid_colors] = scores[idx];
- color_rank[num_valid_colors] = color_neighbors[idx];
- num_valid_colors++;
- }
- }
-
- // Sort everything
- // We need to swap the first two elements if they have the same score but
- // the color indices are not in the right order
- if (score_rank[0] < score_rank[1] ||
- (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) {
- const int tmp_score = score_rank[0];
- const int tmp_color = color_rank[0];
- score_rank[0] = score_rank[1];
- color_rank[0] = color_rank[1];
- score_rank[1] = tmp_score;
- color_rank[1] = tmp_color;
- }
- if (score_rank[0] < score_rank[2]) {
- const int tmp_score = score_rank[0];
- const int tmp_color = color_rank[0];
- score_rank[0] = score_rank[2];
- color_rank[0] = color_rank[2];
- score_rank[2] = tmp_score;
- color_rank[2] = tmp_color;
- }
- if (score_rank[1] < score_rank[2]) {
- const int tmp_score = score_rank[1];
- const int tmp_color = color_rank[1];
- score_rank[1] = score_rank[2];
- color_rank[1] = color_rank[2];
- score_rank[2] = tmp_score;
- color_rank[2] = tmp_color;
- }
-
- if (color_idx != NULL) {
- // If any of the neighbor color has higher index than current color index,
- // then we move up by 1 unless the current color is the same as one of the
- // neighbor
- const int current_color = *color_idx = color_map[r * stride + c];
- int same_neighbor = -1;
- for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; idx++) {
- if (color_rank[idx] > current_color) {
- (*color_idx)++;
- } else if (color_rank[idx] == current_color) {
- same_neighbor = idx;
- }
- }
- if (same_neighbor != -1) {
- *color_idx = same_neighbor;
- }
- }
-
- // Get hash value of context.
- int color_index_ctx_hash = 0;
- static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
- for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; ++idx) {
- color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx];
- }
- assert(color_index_ctx_hash > 0);
- assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
-
- // Lookup context from hash.
- const int color_index_ctx =
- palette_color_index_context_lookup[color_index_ctx_hash];
+ av1_palette_color_index_context_lookup[color_index_ctx_hash];
assert(color_index_ctx >= 0);
assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
return color_index_ctx;
}
-#undef NUM_PALETTE_NEIGHBORS
-#undef MAX_COLOR_CONTEXT_HASH
void av1_init_mode_probs(FRAME_CONTEXT *fc) {
av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
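For context on the lookup table made global above: the neighbor scores (left, top, top-left) are hashed with multipliers {1, 2, 2} and the hash selects one of the five palette color-index contexts. A standalone sketch of that mapping; the table values are copied from the patch and the score triples come from the comment in entropymode.h:

#include <assert.h>
#include <stdio.h>

/* Hash -> context map as introduced above; -1 marks unreachable hashes. */
static const int ctx_lookup[9] = { -1, -1, 0, -1, -1, 4, 3, 2, 1 };

static int palette_ctx_from_scores(const int score[3]) {
  static const int mult[3] = { 1, 2, 2 }; /* left, top, top-left weights */
  int hash = 0;
  for (int i = 0; i < 3; ++i) hash += score[i] * mult[i];
  assert(hash > 0 && hash <= 8);
  return ctx_lookup[hash];
}

int main(void) {
  /* The five reachable score patterns and their contexts (see entropymode.h):
   * (2,0,0)->0  (2,2,1)->1  (3,2,0)->2  (4,1,0)->3  (5,0,0)->4 */
  const int s[3] = { 3, 2, 0 };
  printf("%d\n", palette_ctx_from_scores(s)); /* prints: 2 */
  return 0;
}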
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 59f249b11..d1b0df203 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -16,7 +16,6 @@
#include "av1/common/entropymv.h"
#include "av1/common/filter.h"
#include "av1/common/seg_common.h"
-#include "aom_dsp/aom_filter.h"
#ifdef __cplusplus
extern "C" {
@@ -32,7 +31,7 @@ extern "C" {
// Number of possible contexts for a color index.
// As can be seen from av1_get_palette_color_index_context(), the possible
// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to
-// a value from 0 to 4 using 'palette_color_index_context_lookup' table.
+// a value from 0 to 4 using the 'av1_palette_color_index_context_lookup' table.
#define PALETTE_COLOR_INDEX_CONTEXTS 5
// Palette Y mode context for a block is determined by number of neighboring
@@ -56,6 +55,10 @@ extern "C" {
// 4096(BLOCK_64X64) -> 6
#define PALATTE_BSIZE_CTXS 7
+#define MAX_COLOR_CONTEXT_HASH 8
+
+#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top.
+
#define KF_MODE_CONTEXTS 5
struct AV1Common;
@@ -205,10 +208,8 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
int r, int c, int palette_size,
uint8_t *color_order, int *color_idx);
-// A faster version of av1_get_palette_color_index_context used by the encoder
-// exploiting the fact that the encoder does not need to maintain a color order.
-int av1_fast_palette_color_index_context(const uint8_t *color_map, int stride,
- int r, int c, int *color_idx);
+extern const int
+ av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1];
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/common/enums.h b/av1/common/enums.h
index eb655c9b9..1b952c481 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -454,8 +454,13 @@ enum {
SEQ_LEVEL_7_1,
SEQ_LEVEL_7_2,
SEQ_LEVEL_7_3,
+ SEQ_LEVEL_8_0,
+ SEQ_LEVEL_8_1,
+ SEQ_LEVEL_8_2,
+ SEQ_LEVEL_8_3,
SEQ_LEVELS,
- SEQ_LEVEL_MAX = 31
+ SEQ_LEVEL_MAX = 31,
+ SEQ_LEVEL_KEEP_STATS = 32,
} UENUM1BYTE(AV1_LEVEL);
#define LEVEL_BITS 5
@@ -495,6 +500,7 @@ enum {
#define DELTA_Q_PROBS (DELTA_Q_SMALL)
#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4
#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4
+#define DEFAULT_DELTA_Q_RES_DUCKY_ENCODE 4
#define DELTA_LF_SMALL 3
#define DELTA_LF_PROBS (DELTA_LF_SMALL)
diff --git a/av1/common/filter.h b/av1/common/filter.h
index ded5ce5ae..4344aea91 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -294,6 +294,25 @@ static INLINE uint8_t get_interp_filter_allowed_mask(
return (allow_interp_mask >> filt_type) & 1;
}
+static AOM_INLINE int get_filter_tap(
+ const InterpFilterParams *const filter_params, int subpel_qn) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_qn & SUBPEL_MASK);
+ if (filter_params->taps == 12) {
+ return 12;
+ }
+ if (filter[0] | filter[7]) {
+ return 8;
+ }
+ if (filter[1] | filter[6]) {
+ return 6;
+ }
+ if (filter[2] | filter[5]) {
+ return 4;
+ }
+ return 2;
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
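The new get_filter_tap() above derives the effective tap count from the selected sub-pixel kernel by checking whether the outermost coefficient pairs are zero. A standalone sketch of the same test on an 8-entry kernel; the example kernel is made up for illustration and is not one of the library's filters:

#include <stdint.h>
#include <stdio.h>

/* Effective tap count of an 8-entry kernel: zero outer pairs are skipped so a
 * shorter convolution can be used. */
static int effective_taps(const int16_t filter[8]) {
  if (filter[0] | filter[7]) return 8;
  if (filter[1] | filter[6]) return 6;
  if (filter[2] | filter[5]) return 4;
  return 2;
}

int main(void) {
  /* Outer pair is zero, next pair is not, so this behaves like a 6-tap filter. */
  const int16_t kernel[8] = { 0, -4, 20, 96, 20, -4, 0, 0 };
  printf("%d\n", effective_taps(kernel)); /* prints: 6 */
  return 0;
}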
diff --git a/av1/common/mv.h b/av1/common/mv.h
index c7eaf76d0..a61287b02 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -17,6 +17,7 @@
#include "av1/common/common.h"
#include "av1/common/common_data.h"
#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
#ifdef __cplusplus
extern "C" {
@@ -107,16 +108,6 @@ static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) {
#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
-/* clang-format off */
-enum {
- IDENTITY = 0, // identity transformation, 0-parameter
- TRANSLATION = 1, // translational motion 2-parameter
- ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
- AFFINE = 3, // affine, 6-parameter
- TRANS_TYPES,
-} UENUM1BYTE(TransformationType);
-/* clang-format on */
-
// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES)
// The following can be useful:
// GLOBAL_TRANS_TYPES 3 - up to rotation-zoom
@@ -130,9 +121,6 @@ typedef struct {
int local_warp_allowed;
} WarpTypesAllowed;
-// number of parameters used by each transformation in TransformationTypes
-static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
-
// The order of values in the wmmat matrix below is best described
// by the homography:
// [x' (m2 m3 m0 [x
@@ -282,6 +270,17 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
// After the right shifts, there are 3 fractional bits of precision. If
// allow_hp is false, the bottom bit is always zero (so we don't need a
// call to convert_to_trans_prec here)
+ //
+ // Note: There is an AV1 specification bug here:
+ //
+ // gm->wmmat[0] is supposed to be the horizontal translation, and so should
+ // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical
+ // translation and so should go into res.as_mv.row
+ //
+ // However, in the spec, these assignments are accidentally reversed, and so
+ // we must keep this incorrect logic to match the spec.
+ //
+ // See also: https://crbug.com/aomedia/3328
res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h
index 6ad7ba3cd..361a4078d 100644
--- a/av1/common/pred_common.h
+++ b/av1/common/pred_common.h
@@ -12,6 +12,8 @@
#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
#define AOM_AV1_COMMON_PRED_COMMON_H_
+#include <stdint.h>
+
#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/mvref_common.h"
@@ -21,34 +23,36 @@
extern "C" {
#endif
-static INLINE int get_segment_id(const CommonModeInfoParams *const mi_params,
- const uint8_t *segment_ids, BLOCK_SIZE bsize,
- int mi_row, int mi_col) {
+static INLINE uint8_t get_segment_id(
+ const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
const int bw = mi_size_wide[bsize];
const int bh = mi_size_high[bsize];
const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
- int segment_id = MAX_SEGMENTS;
+ const int seg_stride = mi_params->mi_cols;
+ uint8_t segment_id = MAX_SEGMENTS;
for (int y = 0; y < ymis; ++y) {
for (int x = 0; x < xmis; ++x) {
- segment_id = AOMMIN(segment_id,
- segment_ids[mi_offset + y * mi_params->mi_cols + x]);
+ segment_id =
+ AOMMIN(segment_id, segment_ids[mi_offset + y * seg_stride + x]);
}
}
- assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+ assert(segment_id < MAX_SEGMENTS);
return segment_id;
}
-static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
- const MACROBLOCKD *const xd,
- int *cdf_index, int skip_over4x4) {
+static INLINE uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ int *cdf_index,
+ int skip_over4x4) {
const int step_size = skip_over4x4 ? 2 : 1;
- int prev_ul = -1; // top left segment_id
- int prev_l = -1; // left segment_id
- int prev_u = -1; // top segment_id
+ uint8_t prev_ul = UINT8_MAX; // top left segment_id
+ uint8_t prev_l = UINT8_MAX; // left segment_id
+ uint8_t prev_u = UINT8_MAX; // top segment_id
const int mi_row = xd->mi_row;
const int mi_col = xd->mi_col;
const CommonModeInfoParams *const mi_params = &cm->mi_params;
@@ -65,13 +69,11 @@ static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
prev_l = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0,
mi_col - step_size);
}
- // This property follows from the fact that get_segment_id() returns a
- // nonnegative value. This allows us to test for all edge cases with a simple
- // prev_ul < 0 check.
- assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0));
+ assert(IMPLIES(prev_ul != UINT8_MAX,
+ prev_u != UINT8_MAX && prev_l != UINT8_MAX));
// Pick CDF index based on number of matching/out-of-bounds segment IDs.
- if (prev_ul < 0) /* Edge cases */
+ if (prev_ul == UINT8_MAX) /* Edge cases */
*cdf_index = 0;
else if ((prev_ul == prev_u) && (prev_ul == prev_l))
*cdf_index = 2;
@@ -81,14 +83,14 @@ static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
*cdf_index = 0;
// If 2 or more are identical returns that as predictor, otherwise prev_l.
- if (prev_u == -1) // edge case
- return prev_l == -1 ? 0 : prev_l;
- if (prev_l == -1) // edge case
+ if (prev_u == UINT8_MAX) // edge case
+ return prev_l == UINT8_MAX ? 0 : prev_l;
+ if (prev_l == UINT8_MAX) // edge case
return prev_u;
return (prev_ul == prev_u) ? prev_u : prev_l;
}
-static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+static INLINE uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
const MB_MODE_INFO *const above_mi = xd->above_mbmi;
const MB_MODE_INFO *const left_mi = xd->left_mbmi;
const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0;
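get_segment_id() above narrows its return type to uint8_t, and av1_get_spatial_seg_pred() replaces the -1 "missing neighbor" markers with UINT8_MAX; both are safe because real segment ids are always below MAX_SEGMENTS (8 in AV1). A standalone sketch of the minimum-over-block part, with MAX_SEGMENTS assumed from the AV1 design:

#include <stdint.h>
#include <stdio.h>

#define AOMMIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX_SEGMENTS 8 /* AV1 allows up to 8 segments */

/* Smallest segment id covering a block, as get_segment_id() does; the start
 * value MAX_SEGMENTS is an impossible id, so any real id replaces it. */
static uint8_t min_segment_id(const uint8_t *ids, int n) {
  uint8_t segment_id = MAX_SEGMENTS;
  for (int i = 0; i < n; ++i) segment_id = AOMMIN(segment_id, ids[i]);
  return segment_id;
}

int main(void) {
  const uint8_t ids[4] = { 3, 1, 7, 2 };
  printf("%u\n", min_segment_id(ids, 4)); /* prints: 1 */
  return 0;
}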
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 5dc1de228..11fd257bb 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -57,44 +57,6 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
return 0;
}
-void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
- int block_height, int pix_row, int pix_col,
- int subsampling_x, int subsampling_y, int bit_depth,
- int use_hbd_buf, int is_intrabc,
- const struct scale_factors *sf,
- const struct buf_2d *ref_buf,
- int_interpfilters interp_filters) {
- inter_pred_params->block_width = block_width;
- inter_pred_params->block_height = block_height;
- inter_pred_params->pix_row = pix_row;
- inter_pred_params->pix_col = pix_col;
- inter_pred_params->subsampling_x = subsampling_x;
- inter_pred_params->subsampling_y = subsampling_y;
- inter_pred_params->bit_depth = bit_depth;
- inter_pred_params->use_hbd_buf = use_hbd_buf;
- inter_pred_params->is_intrabc = is_intrabc;
- inter_pred_params->scale_factors = sf;
- inter_pred_params->ref_frame_buf = *ref_buf;
- inter_pred_params->mode = TRANSLATION_PRED;
- inter_pred_params->comp_mode = UNIFORM_SINGLE;
-
- if (is_intrabc) {
- inter_pred_params->interp_filter_params[0] = &av1_intrabc_filter_params;
- inter_pred_params->interp_filter_params[1] = &av1_intrabc_filter_params;
- } else {
- inter_pred_params->interp_filter_params[0] =
- av1_get_interp_filter_params_with_block_size(
- interp_filters.as_filters.x_filter, block_width);
- inter_pred_params->interp_filter_params[1] =
- av1_get_interp_filter_params_with_block_size(
- interp_filters.as_filters.y_filter, block_height);
- }
-}
-
-void av1_init_comp_mode(InterPredParams *inter_pred_params) {
- inter_pred_params->comp_mode = UNIFORM_COMP;
-}
-
void av1_init_warp_params(InterPredParams *inter_pred_params,
const WarpTypesAllowed *warp_types, int ref,
const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
@@ -656,10 +618,10 @@ static AOM_INLINE void build_masked_compound_no_round(
#endif
}
-static void make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
- uint8_t *dst, int dst_stride,
- InterPredParams *inter_pred_params,
- const SubpelParams *subpel_params) {
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
BLOCK_SIZE sb_type = inter_pred_params->sb_type;
@@ -696,26 +658,6 @@ static void make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
inter_pred_params->block_width, inter_pred_params);
}
-void av1_build_one_inter_predictor(
- uint8_t *dst, int dst_stride, const MV *const src_mv,
- InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
- int ref, uint8_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func) {
- SubpelParams subpel_params;
- uint8_t *src;
- int src_stride;
- calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref,
- mc_buf, &src, &subpel_params, &src_stride);
-
- if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
- inter_pred_params->comp_mode == UNIFORM_COMP) {
- av1_make_inter_predictor(src, src_stride, dst, dst_stride,
- inter_pred_params, &subpel_params);
- } else {
- make_masked_inter_predictor(src, src_stride, dst, dst_stride,
- inter_pred_params, &subpel_params);
- }
-}
-
void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
const MB_MODE_INFO *mbmi, int *fwd_offset,
int *bck_offset,
@@ -766,195 +708,6 @@ void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
*bck_offset = quant_dist_lookup_table[i][1 - order];
}
-// True if the following hold:
-// 1. Not intrabc and not build_for_obmc
-// 2. At least one dimension is size 4 with subsampling
-// 3. If sub-sampled, none of the previous blocks around the sub-sample
-// are intrabc or inter-blocks
-static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
- int is_intrabc, int build_for_obmc) {
- if (is_intrabc || build_for_obmc) {
- return false;
- }
-
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ss_x = pd->subsampling_x;
- const int ss_y = pd->subsampling_y;
- const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
- const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
- if (!is_sub4_x && !is_sub4_y) {
- return false;
- }
-
- // For sub8x8 chroma blocks, we may be covering more than one luma block's
- // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
- // the top-left corner of the prediction source - the correct top-left corner
- // is at (pre_x, pre_y).
- const int row_start = is_sub4_y ? -1 : 0;
- const int col_start = is_sub4_x ? -1 : 0;
-
- for (int row = row_start; row <= 0; ++row) {
- for (int col = col_start; col <= 0; ++col) {
- const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
- if (!is_inter_block(this_mbmi)) return false;
- if (is_intrabc_block(this_mbmi)) return false;
- }
- }
- return true;
-}
-
-static void build_inter_predictors_sub8x8(
- const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
- int mi_x, int mi_y, uint8_t **mc_buf,
- CalcSubpelParamsFunc calc_subpel_params_func) {
- const BLOCK_SIZE bsize = mi->bsize;
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const bool ss_x = pd->subsampling_x;
- const bool ss_y = pd->subsampling_y;
- const int b4_w = block_size_wide[bsize] >> ss_x;
- const int b4_h = block_size_high[bsize] >> ss_y;
- const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
- const int b8_w = block_size_wide[plane_bsize];
- const int b8_h = block_size_high[plane_bsize];
- const int is_compound = has_second_ref(mi);
- assert(!is_compound);
- assert(!is_intrabc_block(mi));
-
- // For sub8x8 chroma blocks, we may be covering more than one luma block's
- // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
- // the top-left corner of the prediction source - the correct top-left corner
- // is at (pre_x, pre_y).
- const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
- const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
- int row = row_start;
- for (int y = 0; y < b8_h; y += b4_h) {
- int col = col_start;
- for (int x = 0; x < b8_w; x += b4_w) {
- MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
- int ref = 0;
- const RefCntBuffer *ref_buf =
- get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
- const struct scale_factors *ref_scale_factors =
- get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
- const struct scale_factors *const sf = ref_scale_factors;
- const struct buf_2d pre_buf = {
- NULL,
- (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
- ref_buf->buf.uv_crop_width,
- ref_buf->buf.uv_crop_height,
- ref_buf->buf.uv_stride,
- };
-
- const MV mv = this_mbmi->mv[ref].as_mv;
-
- InterPredParams inter_pred_params;
- av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
- pre_x + x, pd->subsampling_x, pd->subsampling_y,
- xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
- &pre_buf, this_mbmi->interp_filters);
- inter_pred_params.conv_params =
- get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
-
- av1_build_one_inter_predictor(dst, dst_buf->stride, &mv,
- &inter_pred_params, xd, mi_x + x, mi_y + y,
- ref, mc_buf, calc_subpel_params_func);
-
- ++col;
- }
- ++row;
- }
-}
-
-static void build_inter_predictors_8x8_and_bigger(
- const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
- int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf,
- CalcSubpelParamsFunc calc_subpel_params_func) {
- const int is_compound = has_second_ref(mi);
- const int is_intrabc = is_intrabc_block(mi);
- assert(IMPLIES(is_intrabc, !is_compound));
- struct macroblockd_plane *const pd = &xd->plane[plane];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf;
-
- int is_global[2] = { 0, 0 };
- for (int ref = 0; ref < 1 + is_compound; ++ref) {
- const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
- is_global[ref] = is_global_mv_block(mi, wm->wmtype);
- }
-
- const BLOCK_SIZE bsize = mi->bsize;
- const int ss_x = pd->subsampling_x;
- const int ss_y = pd->subsampling_y;
- const int row_start =
- (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
- const int col_start =
- (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
- for (int ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
- struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
- const MV mv = mi->mv[ref].as_mv;
- const WarpTypesAllowed warp_types = { is_global[ref],
- mi->motion_mode == WARPED_CAUSAL };
-
- InterPredParams inter_pred_params;
- av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
- pd->subsampling_x, pd->subsampling_y, xd->bd,
- is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
- mi->interp_filters);
- if (is_compound) av1_init_comp_mode(&inter_pred_params);
- inter_pred_params.conv_params = get_conv_params_no_round(
- ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
-
- av1_dist_wtd_comp_weight_assign(
- cm, mi, &inter_pred_params.conv_params.fwd_offset,
- &inter_pred_params.conv_params.bck_offset,
- &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
-
- if (!build_for_obmc)
- av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
-
- if (is_masked_compound_type(mi->interinter_comp.type)) {
- inter_pred_params.sb_type = mi->bsize;
- inter_pred_params.mask_comp = mi->interinter_comp;
- if (ref == 1) {
- inter_pred_params.conv_params.do_average = 0;
- inter_pred_params.comp_mode = MASK_COMP;
- }
- // Assign physical buffer.
- inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
- }
-
- av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
- xd, mi_x, mi_y, ref, mc_buf,
- calc_subpel_params_func);
- }
-}
-
-void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int plane, const MB_MODE_INFO *mi,
- int build_for_obmc, int bw, int bh, int mi_x,
- int mi_y, uint8_t **mc_buf,
- CalcSubpelParamsFunc calc_subpel_params_func) {
- if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
- build_for_obmc)) {
- assert(bw < 8 || bh < 8);
- build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf,
- calc_subpel_params_func);
- } else {
- build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
- bh, mi_x, mi_y, mc_buf,
- calc_subpel_params_func);
- }
-}
void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const int plane_start, const int plane_end) {
@@ -1033,7 +786,7 @@ static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row,
(void)op_mi_size;
(void)dir;
(void)mi;
- ++*(int *)fun_ctxt;
+ ++*(uint8_t *)fun_ctxt;
(void)num_planes;
}
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 056dc67d0..da7b84a8e 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -79,6 +79,8 @@ typedef struct SubpelParams {
int ys;
int subpel_x;
int subpel_y;
+ int pos_x;
+ int pos_y;
} SubpelParams;
struct build_prediction_ctxt {
@@ -121,17 +123,112 @@ typedef struct InterPredParams {
INTERINTER_COMPOUND_DATA mask_comp;
BLOCK_SIZE sb_type;
int is_intrabc;
+ int top;
+ int left;
} InterPredParams;
-void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
- int block_height, int pix_row, int pix_col,
- int subsampling_x, int subsampling_y, int bit_depth,
- int use_hbd_buf, int is_intrabc,
- const struct scale_factors *sf,
- const struct buf_2d *ref_buf,
- int_interpfilters interp_filters);
+// Initialize sub-pel params required for inter prediction.
+static AOM_INLINE void init_subpel_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ SubpelParams *subpel_params, int width, int height) {
+ const struct scale_factors *sf = inter_pred_params->scale_factors;
+ int ssx = inter_pred_params->subsampling_x;
+ int ssy = inter_pred_params->subsampling_y;
+ int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+ orig_pos_y += src_mv->row * (1 << (1 - ssy));
+ int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+ orig_pos_x += src_mv->col * (1 << (1 - ssx));
+ const int is_scaled = av1_is_scaled(sf);
+ int pos_x, pos_y;
+ if (LIKELY(!is_scaled)) {
+ pos_y = av1_unscaled_value(orig_pos_y, sf);
+ pos_x = av1_unscaled_value(orig_pos_x, sf);
+ } else {
+ pos_y = av1_scaled_y(orig_pos_y, sf);
+ pos_x = av1_scaled_x(orig_pos_x, sf);
+ }
+
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int bottom = (height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ const int right = (width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, inter_pred_params->top, bottom);
+ pos_x = clamp(pos_x, inter_pred_params->left, right);
+
+ subpel_params->pos_x = pos_x;
+ subpel_params->pos_y = pos_y;
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+}
+
+// Initialize interp filter required for inter prediction.
+static AOM_INLINE void init_interp_filter_params(
+ const InterpFilterParams *interp_filter_params[2],
+ const InterpFilters *filter, int block_width, int block_height,
+ int is_intrabc) {
+ if (UNLIKELY(is_intrabc)) {
+ interp_filter_params[0] = &av1_intrabc_filter_params;
+ interp_filter_params[1] = &av1_intrabc_filter_params;
+ } else {
+ interp_filter_params[0] = av1_get_interp_filter_params_with_block_size(
+ (InterpFilter)filter->x_filter, block_width);
+ interp_filter_params[1] = av1_get_interp_filter_params_with_block_size(
+ (InterpFilter)filter->y_filter, block_height);
+ }
+}
+
+// Initialize parameters required for inter prediction at mode level.
+static AOM_INLINE void init_inter_mode_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ SubpelParams *subpel_params, const struct scale_factors *sf, int width,
+ int height) {
+ inter_pred_params->scale_factors = sf;
+ init_subpel_params(src_mv, inter_pred_params, subpel_params, width, height);
+}
+
+// Initialize parameters required for inter prediction at block level.
+static AOM_INLINE void init_inter_block_params(
+ InterPredParams *inter_pred_params, int block_width, int block_height,
+ int pix_row, int pix_col, int subsampling_x, int subsampling_y,
+ int bit_depth, int use_hbd_buf, int is_intrabc) {
+ inter_pred_params->block_width = block_width;
+ inter_pred_params->block_height = block_height;
+ inter_pred_params->pix_row = pix_row;
+ inter_pred_params->pix_col = pix_col;
+ inter_pred_params->subsampling_x = subsampling_x;
+ inter_pred_params->subsampling_y = subsampling_y;
+ inter_pred_params->bit_depth = bit_depth;
+ inter_pred_params->use_hbd_buf = use_hbd_buf;
+ inter_pred_params->is_intrabc = is_intrabc;
+ inter_pred_params->mode = TRANSLATION_PRED;
+ inter_pred_params->comp_mode = UNIFORM_SINGLE;
+ inter_pred_params->top = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y);
+ inter_pred_params->left = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_x);
+}
-void av1_init_comp_mode(InterPredParams *inter_pred_params);
+// Initialize params required for inter prediction.
+static AOM_INLINE void av1_init_inter_params(
+ InterPredParams *inter_pred_params, int block_width, int block_height,
+ int pix_row, int pix_col, int subsampling_x, int subsampling_y,
+ int bit_depth, int use_hbd_buf, int is_intrabc,
+ const struct scale_factors *sf, const struct buf_2d *ref_buf,
+ int_interpfilters interp_filters) {
+ init_inter_block_params(inter_pred_params, block_width, block_height, pix_row,
+ pix_col, subsampling_x, subsampling_y, bit_depth,
+ use_hbd_buf, is_intrabc);
+ init_interp_filter_params(inter_pred_params->interp_filter_params,
+ &interp_filters.as_filters, block_width,
+ block_height, is_intrabc);
+ inter_pred_params->scale_factors = sf;
+ inter_pred_params->ref_frame_buf = *ref_buf;
+}
+
+static AOM_INLINE void av1_init_comp_mode(InterPredParams *inter_pred_params) {
+ inter_pred_params->comp_mode = UNIFORM_COMP;
+}
void av1_init_warp_params(InterPredParams *inter_pred_params,
const WarpTypesAllowed *warp_types, int ref,
@@ -235,24 +332,10 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride,
InterPredParams *inter_pred_params,
const SubpelParams *subpel_params);
-
-typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv,
- InterPredParams *const inter_pred_params,
- MACROBLOCKD *xd, int mi_x, int mi_y,
- int ref, uint8_t **mc_buf, uint8_t **pre,
- SubpelParams *subpel_params,
- int *src_stride);
-
-void av1_build_one_inter_predictor(
- uint8_t *dst, int dst_stride, const MV *const src_mv,
- InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
- int ref, uint8_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func);
-
-void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int plane, const MB_MODE_INFO *mi,
- int build_for_obmc, int bw, int bh, int mi_x,
- int mi_y, uint8_t **mc_buf,
- CalcSubpelParamsFunc calc_subpel_params_func);
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
// TODO(jkoleszar): yet another mv clamping function :-(
static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
@@ -284,10 +367,17 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset,
int stride,
const struct scale_factors *sf) {
- const int x =
- sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset;
- const int y =
- sf ? sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset;
+ int x, y;
+ if (!sf) {
+ x = x_offset;
+ y = y_offset;
+ } else if (av1_is_scaled(sf)) {
+ x = av1_scaled_x(x_offset, sf) >> SCALE_EXTRA_BITS;
+ y = av1_scaled_y(y_offset, sf) >> SCALE_EXTRA_BITS;
+ } else {
+ x = av1_unscaled_value(x_offset, sf) >> SCALE_EXTRA_BITS;
+ y = av1_unscaled_value(y_offset, sf) >> SCALE_EXTRA_BITS;
+ }
return (int64_t)y * stride + x;
}
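A minimal usage sketch (illustrative only; the helper name is hypothetical, but it relies solely on the function defined above): callers can turn a pixel position in the current frame into a pointer into a possibly scaled reference buffer.

    // Hypothetical helper: locate a block's top-left sample in a reference
    // buffer that may be stored at a different (scaled) resolution.
    static const uint8_t *ref_block_ptr(const uint8_t *buf0, int stride, int x,
                                        int y, const struct scale_factors *sf) {
      return buf0 + scaled_buffer_offset(x, y, stride, sf);
    }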
diff --git a/av1/common/reconinter_template.inc b/av1/common/reconinter_template.inc
new file mode 100644
index 000000000..863c13c11
--- /dev/null
+++ b/av1/common/reconinter_template.inc
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef IS_DEC
+#error "IS_DEC must be defined for reconinter_template.inc."
+#endif
+
+#if IS_DEC
+static AOM_INLINE void build_one_inter_predictor(
+ uint8_t *dst, int dst_stride, const MV *src_mv,
+ InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
+ int ref, uint8_t **mc_buf) {
+#else
+static AOM_INLINE void build_one_inter_predictor(
+ uint8_t *dst, int dst_stride, const MV *src_mv,
+ InterPredParams *inter_pred_params) {
+#endif // IS_DEC
+ SubpelParams subpel_params;
+ uint8_t *src;
+ int src_stride;
+#if IS_DEC
+ dec_calc_subpel_params_and_extend(src_mv, inter_pred_params, xd, mi_x, mi_y,
+ ref, mc_buf, &src, &subpel_params,
+ &src_stride);
+#else
+ enc_calc_subpel_params(src_mv, inter_pred_params, &src, &subpel_params,
+ &src_stride);
+#endif // IS_DEC
+ if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
+ inter_pred_params->comp_mode == UNIFORM_COMP) {
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride,
+ inter_pred_params, &subpel_params);
+ } else {
+ av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
+ inter_pred_params, &subpel_params);
+ }
+}
+
+// True if all of the following hold:
+// 1. Not intrabc and not build_for_obmc
+// 2. At least one dimension is size 4 with subsampling
+// 3. If sub-sampled, every neighboring block sharing this chroma block is an
+//    inter block and none of them use intrabc
+static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
+ int is_intrabc, int build_for_obmc) {
+ if (is_intrabc || build_for_obmc) {
+ return false;
+ }
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
+ const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
+ if (!is_sub4_x && !is_sub4_y) {
+ return false;
+ }
+
+  // For sub8x8 chroma blocks, the chroma plane can cover the pixels of more
+  // than one luma block. Scan the mode info of every luma block sharing this
+  // chroma block (hence the -1 start offsets below) and bail out if any of
+  // them is not a plain inter block.
+ const int row_start = is_sub4_y ? -1 : 0;
+ const int col_start = is_sub4_x ? -1 : 0;
+
+ for (int row = row_start; row <= 0; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) return false;
+ if (is_intrabc_block(this_mbmi)) return false;
+ }
+ }
+ return true;
+}
+
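A short worked example of the sub8x8 condition, derived only from the lookups in the function above:

    // Example (illustrative): bsize == BLOCK_4X8 with 4:2:0 subsampling.
    //   is_sub4_x = (block_size_wide[BLOCK_4X8] == 4) && ss_x -> true
    //   col_start = -1, row_start = 0
    // The loop therefore also inspects the mode info one column to the left,
    // whose luma pixels share this chroma block; if that block is intra or
    // intrabc, the function returns false and the 8x8-and-bigger path is used.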
+#if IS_DEC
+static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int mi_x, int mi_y,
+ uint8_t **mc_buf) {
+#else
+static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int mi_x, int mi_y) {
+#endif // IS_DEC
+ const BLOCK_SIZE bsize = mi->bsize;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const bool ss_x = pd->subsampling_x;
+ const bool ss_y = pd->subsampling_y;
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize];
+ const int b8_h = block_size_high[plane_bsize];
+ const int is_compound = has_second_ref(mi);
+ assert(!is_compound);
+ assert(!is_intrabc_block(mi));
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
+ const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ int row = row_start;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+ int ref = 0;
+ const RefCntBuffer *ref_buf =
+ get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *ref_scale_factors =
+ get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *const sf = ref_scale_factors;
+ const struct buf_2d pre_buf = {
+ NULL,
+ (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
+ ref_buf->buf.uv_crop_width,
+ ref_buf->buf.uv_crop_height,
+ ref_buf->buf.uv_stride,
+ };
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
+ pre_x + x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
+ &pre_buf, this_mbmi->interp_filters);
+ inter_pred_params.conv_params =
+ get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
+
+#if IS_DEC
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
+ xd, mi_x + x, mi_y + y, ref, mc_buf);
+#else
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
+#endif // IS_DEC
+
+ ++col;
+ }
+ ++row;
+ }
+}
+
+#if IS_DEC
+static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
+#else
+static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x, int mi_y) {
+#endif // IS_DEC
+ const int is_compound = has_second_ref(mi);
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+
+ int is_global[2] = { 0, 0 };
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->bsize;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+ const MV mv = mi->mv[ref].as_mv;
+ const WarpTypesAllowed warp_types = { is_global[ref],
+ mi->motion_mode == WARPED_CAUSAL };
+
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
+ mi->interp_filters);
+ if (is_compound) av1_init_comp_mode(&inter_pred_params);
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+ av1_dist_wtd_comp_weight_assign(
+ cm, mi, &inter_pred_params.conv_params.fwd_offset,
+ &inter_pred_params.conv_params.bck_offset,
+ &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
+
+ if (!build_for_obmc)
+ av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ inter_pred_params.sb_type = mi->bsize;
+ inter_pred_params.mask_comp = mi->interinter_comp;
+ if (ref == 1) {
+ inter_pred_params.conv_params.do_average = 0;
+ inter_pred_params.comp_mode = MASK_COMP;
+ }
+ // Assign physical buffer.
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
+ }
+
+#if IS_DEC
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd,
+ mi_x, mi_y, ref, mc_buf);
+#else
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
+#endif // IS_DEC
+ }
+}
+
+#if IS_DEC
+static AOM_INLINE void build_inter_predictors(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
+ if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
+ build_for_obmc)) {
+ assert(bw < 8 || bh < 8);
+ build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf);
+ } else {
+ build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
+ bh, mi_x, mi_y, mc_buf);
+ }
+}
+#else
+static AOM_INLINE void build_inter_predictors(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw,
+ int bh, int mi_x, int mi_y) {
+ if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
+ build_for_obmc)) {
+ assert(bw < 8 || bh < 8);
+ build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y);
+ } else {
+ build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
+ bh, mi_x, mi_y);
+ }
+}
+#endif // IS_DEC
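The template above is meant to be compiled twice, selected by IS_DEC. A minimal sketch of how a translation unit might instantiate it (the include sites shown here are illustrative, not necessarily the exact files used in the tree):

    // Decoder-side instantiation (illustrative).
    #define IS_DEC 1
    #include "av1/common/reconinter_template.inc"
    #undef IS_DEC

    // Encoder-side instantiation (illustrative).
    #define IS_DEC 0
    #include "av1/common/reconinter_template.inc"
    #undef IS_DEC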
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 226294583..242930cb8 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -1207,38 +1207,47 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
const InterpFilter filter,
const int phase_scaler,
const int num_planes) {
- const int src_w = src->y_crop_width;
- const int src_h = src->y_crop_height;
- const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
- src->v_buffer };
- const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
- uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
- const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH ||
filter == EIGHTTAP_REGULAR);
const InterpKernel *const kernel =
- filter == BILINEAR ? av1_bilinear_filters : av1_sub_pel_filters_8smooth;
- const int dst_w = dst->y_crop_width;
- const int dst_h = dst->y_crop_height;
+ (const InterpKernel *)av1_interp_filter_params_list[filter].filter_ptr;
+
for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
- const int factor = (i == 0 || i == 3 ? 1 : 2);
- const int src_stride = src_strides[i];
- const int dst_stride = dst_strides[i];
+ const int is_uv = i > 0;
+ const int src_w = src->crop_widths[is_uv];
+ const int src_h = src->crop_heights[is_uv];
+ const uint8_t *src_buffer = src->buffers[i];
+ const int src_stride = src->strides[is_uv];
+ const int dst_w = dst->crop_widths[is_uv];
+ const int dst_h = dst->crop_heights[is_uv];
+ uint8_t *dst_buffer = dst->buffers[i];
+ const int dst_stride = dst->strides[is_uv];
for (int y = 0; y < dst_h; y += 16) {
- const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+ const int y_q4 = y * 16 * src_h / dst_h + phase_scaler;
for (int x = 0; x < dst_w; x += 16) {
- const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
- const uint8_t *src_ptr = srcs[i] +
- (y / factor) * src_h / dst_h * src_stride +
- (x / factor) * src_w / dst_w;
- uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
-
- aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
- x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
- 16 * src_h / dst_h, 16 / factor, 16 / factor);
+ const int x_q4 = x * 16 * src_w / dst_w + phase_scaler;
+ const uint8_t *src_ptr =
+ src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w;
+ uint8_t *dst_ptr = dst_buffer + y * dst_stride + x;
+
+ // Width and height of the actual working area.
+ const int work_w = AOMMIN(16, dst_w - x);
+ const int work_h = AOMMIN(16, dst_h - y);
+ // SIMD versions of aom_scaled_2d() have some trouble handling
+ // nonstandard sizes, so fall back on the C version to handle borders.
+ if (work_w != 16 || work_h != 16) {
+ aom_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, work_w, work_h);
+ } else {
+ aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16, 16);
+ }
}
}
}
+ aom_extend_frame_borders(dst, num_planes);
}
void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
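The work_w/work_h fallback above only triggers when the destination dimensions are not multiples of 16; a quick example:

    // Example (illustrative): dst_w == 100.
    //   At x == 96, work_w = AOMMIN(16, 100 - 96) = 4, so the rightmost column
    //   of blocks goes through aom_scaled_2d_c() instead of the SIMD path.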
@@ -1384,9 +1393,16 @@ YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate scaled buffer");
- const bool has_optimized_scaler = av1_has_optimized_scaler(
+ bool has_optimized_scaler = av1_has_optimized_scaler(
unscaled->y_crop_width, unscaled->y_crop_height, scaled_width,
scaled_height);
+ if (num_planes > 1) {
+ has_optimized_scaler = has_optimized_scaler &&
+ av1_has_optimized_scaler(unscaled->uv_crop_width,
+ unscaled->uv_crop_height,
+ scaled->uv_crop_width,
+ scaled->uv_crop_height);
+ }
#if CONFIG_AV1_HIGHBITDEPTH
if (use_optimized_scaler && has_optimized_scaler &&
diff --git a/av1/common/resize.h b/av1/common/resize.h
index 9bc23b3ff..4e8ee0fe9 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -102,23 +102,33 @@ static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
// Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
// required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
// So, the following check is more accurate.
- return !(cm->width == cm->superres_upscaled_width);
+ return (cm->width != cm->superres_upscaled_width);
}
-// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling.
-// SSSE3 also has optimizations for 2x upscaling.
-// Use non normative scalers for other scaling ratios.
+// The optimized scaler av1_resize_and_extend_frame() can only handle scaling
+// ratios >= 1/4 and <= 16. See the comment in aom_convolve8_c() for details.
+// Visual assessment shows that if the scaling ratio or its reciprocal is not a
+// multiple of 1/16, there are some artifacts in the output of the optimized
+// scaler, especially on lines, due to non-exact ratio representation. SSSE3
+// and NEON have a specialized 3/4 version of av1_resize_and_extend_frame()
+// that does not have this issue.
+//
+// Use the non-normative scaler av1_resize_and_extend_frame_nonnormative()
+// for other scaling ratios.
static INLINE bool av1_has_optimized_scaler(const int src_width,
const int src_height,
const int dst_width,
const int dst_height) {
- const bool has_optimized_scaler =
- (dst_width * 4 == src_width && dst_height * 4 == src_height) ||
- (dst_width * 2 == src_width && dst_height * 2 == src_height) ||
- (dst_width * 4 == src_width * 3 && dst_height * 4 == src_height * 3);
-#if HAVE_SSSE3
- return has_optimized_scaler ||
- (dst_width == src_width * 2 && dst_height == src_height * 2);
+ bool has_optimized_scaler =
+ (dst_width * 4 >= src_width && dst_height * 4 >= src_height) &&
+ (dst_width <= src_width * 16 && dst_height <= src_height * 16) &&
+ (16 * dst_width % src_width == 0) && (16 * src_width % dst_width == 0) &&
+ (16 * dst_height % src_height == 0) &&
+ (16 * src_height % dst_height == 0);
+#if HAVE_SSSE3 || HAVE_NEON
+ has_optimized_scaler =
+ has_optimized_scaler ||
+ (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height);
#endif
return has_optimized_scaler;
}
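Two worked examples of the check above (the arithmetic follows directly from the conditions shown):

    // 1920x1080 -> 960x540 (1/2 in each dimension):
    //   16 * 960 % 1920 == 0 and 16 * 1920 % 960 == 0, so the generic check
    //   passes and the optimized scaler is used.
    // 1920x1080 -> 1440x810 (3/4):
    //   16 * 1920 % 1440 == 480 != 0, so the generic check fails; on SSSE3 or
    //   NEON the dedicated branch (4 * dst == 3 * src) still accepts it.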
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index dbfd1cc0d..822f24094 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -39,8 +39,8 @@ const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
{ { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
};
-AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
- AV1PixelRect rect;
+PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
+ PixelRect rect;
int ss_x = is_uv && cm->seq_params->subsampling_x;
int ss_y = is_uv && cm->seq_params->subsampling_y;
@@ -70,7 +70,7 @@ void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
// top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
// to do the computation ourselves, iterating over the tiles and keeping
// track of the largest width and height, then upscaling.
- const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
const int max_tile_w = tile_rect.right - tile_rect.left;
const int max_tile_h = tile_rect.bottom - tile_rect.top;
@@ -249,7 +249,7 @@ static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
// av1_loop_restoration_save_boundary_lines() function, so here we just need
// to decide if we're overwriting the above/below boundary pixels or not.
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
- const AV1PixelRect *tile_rect, int ss_y,
+ const PixelRect *tile_rect, int ss_y,
int *copy_above, int *copy_below) {
*copy_above = 1;
*copy_below = 1;
@@ -1024,7 +1024,7 @@ static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
void av1_loop_restoration_filter_unit(
const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
- const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+ const PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
int dst_stride, int32_t *tmpbuf, int optimized_lr) {
RestorationType unit_rtype = rui->restoration_type;
@@ -1090,8 +1090,8 @@ void av1_loop_restoration_filter_unit(
}
static void filter_frame_on_unit(const RestorationTileLimits *limits,
- const AV1PixelRect *tile_rect,
- int rest_unit_idx, void *priv, int32_t *tmpbuf,
+ const PixelRect *tile_rect, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
const RestorationInfo *rsi = ctxt->rsi;
@@ -1117,7 +1117,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
if (aom_realloc_frame_buffer(
lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
- cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) < 0)
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) != AOM_CODEC_OK)
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate restoration dst buffer");
@@ -1166,7 +1166,7 @@ void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
assert(num_planes <= 3);
for (int plane = 0; plane < num_planes; ++plane) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
- AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
+ PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
tile_rect.right, tile_rect.top, tile_rect.bottom);
}
@@ -1204,7 +1204,7 @@ void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
}
void av1_foreach_rest_unit_in_row(
- RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+ RestorationTileLimits *limits, const PixelRect *tile_rect,
rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
@@ -1259,7 +1259,7 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
}
static void foreach_rest_unit_in_tile(
- const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+ const PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
@@ -1295,7 +1295,7 @@ static void foreach_rest_unit_in_tile(
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
rest_unit_visitor_t on_rest_unit,
- void *priv, AV1PixelRect *tile_rect,
+ void *priv, PixelRect *tile_rect,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
const int is_uv = plane > 0;
@@ -1322,7 +1322,7 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
const int is_uv = plane > 0;
- const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
const int tile_w = tile_rect.right - tile_rect.left;
const int tile_h = tile_rect.bottom - tile_rect.top;
@@ -1500,7 +1500,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
// Get the tile rectangle, with height rounded up to the next multiple of 8
// luma pixels (only relevant for the bottom tile of the frame)
- const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
const int stripe0 = 0;
RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 65ccd0900..bf2130341 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -332,7 +332,7 @@ typedef struct {
} RestorationTileLimits;
typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
- const AV1PixelRect *tile_rect,
+ const PixelRect *tile_rect,
int rest_unit_idx, void *priv,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs);
@@ -344,7 +344,7 @@ typedef struct FilterFrameCtxt {
int highbd, bit_depth;
uint8_t *data8, *dst8;
int data_stride, dst_stride;
- AV1PixelRect tile_rect;
+ PixelRect tile_rect;
} FilterFrameCtxt;
typedef struct AV1LrStruct {
@@ -397,13 +397,13 @@ void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
* should be at least SGRPROJ_TMPBUF_SIZE big.
* \param[in] optimized_lr Whether to use fast optimized Loop Restoration
*
- * \return Nothing is returned. Instead, the filtered unit is output in
+ * \remark Nothing is returned. Instead, the filtered unit is output in
* \c dst8 at the proper restoration unit offset.
*/
void av1_loop_restoration_filter_unit(
const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
- const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+ const PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
int dst_stride, int32_t *tmpbuf, int optimized_lr);
@@ -417,7 +417,7 @@ void av1_loop_restoration_filter_unit(
* \param[in] optimized_lr Whether to use fast optimized Loop Restoration
* \param[in] lr_ctxt Loop restoration context
*
- * \return Nothing is returned. Instead, the filtered frame is output in
+ * \remark Nothing is returned. Instead, the filtered frame is output in
* \c frame.
*/
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
@@ -439,7 +439,7 @@ typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
// Call on_rest_unit for each loop restoration unit in the plane.
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
rest_unit_visitor_t on_rest_unit,
- void *priv, AV1PixelRect *tile_rect,
+ void *priv, PixelRect *tile_rect,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs);
@@ -467,13 +467,13 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
struct AV1Common *cm, int num_planes);
void av1_foreach_rest_unit_in_row(
- RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+ RestorationTileLimits *limits, const PixelRect *tile_rect,
rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
struct AV1LrSyncData *const lr_sync);
-AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
+PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
int av1_lr_count_units_in_tile(int unit_size, int tile_size);
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
diff --git a/av1/common/scale.c b/av1/common/scale.c
index 5bcd8df0c..d7c6a2437 100644
--- a/av1/common/scale.c
+++ b/av1/common/scale.c
@@ -16,30 +16,6 @@
#include "av1/common/scale.h"
#include "aom_dsp/aom_filter.h"
-// Note: Expect val to be in q4 precision
-static INLINE int scaled_x(int val, const struct scale_factors *sf) {
- const int off =
- (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
- const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
- return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
- REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
-}
-
-// Note: Expect val to be in q4 precision
-static INLINE int scaled_y(int val, const struct scale_factors *sf) {
- const int off =
- (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
- const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
- return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
- REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
-}
-
-// Note: Expect val to be in q4 precision
-static int unscaled_value(int val, const struct scale_factors *sf) {
- (void)sf;
- return val * (1 << SCALE_EXTRA_BITS);
-}
-
static int get_fixed_point_scale_factor(int other_size, int this_size) {
// Calculate scaling factor once for each reference frame
// and use fixed point scaling factors in decoding and encoding routines.
@@ -56,10 +32,12 @@ static int fixed_point_scale_to_coarse_point_scale(int scale_fp) {
// Note: x and y are integer precision, mvq4 is q4 precision.
MV32 av1_scale_mv(const MV *mvq4, int x, int y,
const struct scale_factors *sf) {
- const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf);
- const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf);
- const MV32 res = { scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
- scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 };
+ const int x_off_q4 = av1_scaled_x(x << SUBPEL_BITS, sf);
+ const int y_off_q4 = av1_scaled_y(y << SUBPEL_BITS, sf);
+ const MV32 res = {
+ av1_scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
+ av1_scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4
+ };
return res;
}
@@ -76,12 +54,4 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
-
- if (av1_is_scaled(sf)) {
- sf->scale_value_x = scaled_x;
- sf->scale_value_y = scaled_y;
- } else {
- sf->scale_value_x = unscaled_value;
- sf->scale_value_y = unscaled_value;
- }
}
diff --git a/av1/common/scale.h b/av1/common/scale.h
index fd30416df..d8481bfc2 100644
--- a/av1/common/scale.h
+++ b/av1/common/scale.h
@@ -30,11 +30,32 @@ struct scale_factors {
int y_scale_fp; // vertical fixed point scale factor
int x_step_q4;
int y_step_q4;
-
- int (*scale_value_x)(int val, const struct scale_factors *sf);
- int (*scale_value_y)(int val, const struct scale_factors *sf);
};
+// Note: Expect val to be in q4 precision
+static INLINE int av1_scaled_x(int val, const struct scale_factors *sf) {
+ const int off =
+ (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+ const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
+ return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
+ REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
+}
+
+// Note: Expect val to be in q4 precision
+static INLINE int av1_scaled_y(int val, const struct scale_factors *sf) {
+ const int off =
+ (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+ const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
+ return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
+ REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
+}
+
+// Note: Expect val to be in q4 precision
+static INLINE int av1_unscaled_value(int val, const struct scale_factors *sf) {
+ (void)sf;
+ return val * (1 << SCALE_EXTRA_BITS);
+}
+
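A brief note on units, implied by the code above: val is a q4 position and the result carries SCALE_EXTRA_BITS of additional fractional precision.

    // Illustrative: for an unscaled reference, av1_unscaled_value() is just the
    // left shift by SCALE_EXTRA_BITS. For a reference twice the current width,
    // x_scale_fp == 2 << REF_SCALE_SHIFT, so av1_scaled_x() roughly doubles the
    // q4 position (plus the centering term `off`) before rounding.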
MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
@@ -52,6 +73,7 @@ static INLINE int av1_is_scaled(const struct scale_factors *sf) {
(sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
}
+// See AV1 spec, Section 6.8.6. Frame size with refs semantics.
static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
int this_width, int this_height) {
return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&
diff --git a/av1/common/seg_common.h b/av1/common/seg_common.h
index 3ad058c29..44b508b14 100644
--- a/av1/common/seg_common.h
+++ b/av1/common/seg_common.h
@@ -59,7 +59,7 @@ struct segmentation_probs {
};
static INLINE int segfeature_active(const struct segmentation *seg,
- int segment_id,
+ uint8_t segment_id,
SEG_LVL_FEATURES feature_id) {
return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
}
@@ -96,6 +96,16 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
return seg->feature_data[segment_id][feature_id];
}
+static AOM_INLINE void set_segment_id(uint8_t *segment_ids, int mi_offset,
+ int x_mis, int y_mis, int mi_stride,
+ uint8_t segment_id) {
+ segment_ids += mi_offset;
+ for (int y = 0; y < y_mis; ++y) {
+ memset(&segment_ids[y * mi_stride], segment_id,
+ x_mis * sizeof(segment_ids[0]));
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
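The set_segment_id() helper moved into this header writes one segment id over a block's footprint in the mi grid; a small worked example (mi units are 4x4 pixels):

    // Example (illustrative): a 16x16 block covers x_mis = y_mis = 4 mi units,
    // so the loop memsets four rows of four bytes each, starting at mi_offset,
    // with the new segment_id.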
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 54e9443e4..4db0b8e87 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -138,20 +138,6 @@ void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
}
}
-static void loop_filter_data_reset(LFWorkerData *lf_data,
- YV12_BUFFER_CONFIG *frame_buffer,
- struct AV1Common *cm, MACROBLOCKD *xd) {
- struct macroblockd_plane *pd = xd->plane;
- lf_data->frame_buffer = frame_buffer;
- lf_data->cm = cm;
- lf_data->xd = xd;
- for (int i = 0; i < MAX_MB_PLANE; i++) {
- memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst));
- lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
- lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
- }
-}
-
void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
int num_workers) {
if (num_workers < 1) return;
@@ -261,86 +247,16 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
#endif // CONFIG_MULTITHREAD
}
-static AOM_FORCE_INLINE bool skip_loop_filter_plane(const int planes_to_lf[3],
- int plane,
- int lpf_opt_level) {
- // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
- // chroma planes together
- if (lpf_opt_level == 2) {
- if (plane == AOM_PLANE_Y) {
- return !planes_to_lf[plane];
- }
- if (plane == AOM_PLANE_U) {
- // U and V are handled together
- return !planes_to_lf[1] && !planes_to_lf[2];
- }
- assert(plane == AOM_PLANE_V);
- if (plane == AOM_PLANE_V) {
- // V is handled when u is filtered
- return true;
- }
- }
-
- // Normal operation mode
- return !planes_to_lf[plane];
-}
-
-static void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
- const int planes_to_lf[3], int lpf_opt_level) {
- int mi_row, plane, dir;
- AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
- lf_sync->jobs_enqueued = 0;
- lf_sync->jobs_dequeued = 0;
-
- // Launch all vertical jobs first, as they are blocking the horizontal ones.
- // Launch top row jobs for all planes first, in case the output can be
- // partially reconstructed row by row.
- for (dir = 0; dir < 2; ++dir) {
- for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
- for (plane = 0; plane < 3; ++plane) {
- if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
- continue;
- }
- if (!planes_to_lf[plane]) continue;
- lf_job_queue->mi_row = mi_row;
- lf_job_queue->plane = plane;
- lf_job_queue->dir = dir;
- lf_job_queue->lpf_opt_level = lpf_opt_level;
- lf_job_queue++;
- lf_sync->jobs_enqueued++;
- }
- }
- }
-}
-
-static AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
- AV1LfMTInfo *cur_job_info = NULL;
-
-#if CONFIG_MULTITHREAD
- pthread_mutex_lock(lf_sync->job_mutex);
-
- if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
- cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
- lf_sync->jobs_dequeued++;
- }
-
- pthread_mutex_unlock(lf_sync->job_mutex);
-#else
- (void)lf_sync;
-#endif
-
- return cur_job_info;
-}
-
// One job of row loopfiltering.
-static INLINE void thread_loop_filter_rows(
+void av1_thread_loop_filter_rows(
const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
- AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf) {
+ AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf,
+ int num_mis_in_lpf_unit_height_log2) {
const int sb_cols =
CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
- const int r = mi_row >> MAX_MIB_SIZE_LOG2;
+ const int r = mi_row >> num_mis_in_lpf_unit_height_log2;
int mi_col, c;
const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y;
@@ -356,11 +272,12 @@ static INLINE void thread_loop_filter_rows(
if (lpf_opt_level) {
if (plane == AOM_PLANE_Y) {
av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row,
- mi_col, params_buf, tx_buf);
+ mi_col, params_buf, tx_buf,
+ num_mis_in_lpf_unit_height_log2);
} else {
- av1_filter_block_plane_vert_opt_chroma(cm, xd, &planes[plane], mi_row,
- mi_col, params_buf, tx_buf,
- plane, joint_filter_chroma);
+ av1_filter_block_plane_vert_opt_chroma(
+ cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
+ joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
}
} else {
av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
@@ -388,11 +305,12 @@ static INLINE void thread_loop_filter_rows(
if (lpf_opt_level) {
if (plane == AOM_PLANE_Y) {
av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row,
- mi_col, params_buf, tx_buf);
+ mi_col, params_buf, tx_buf,
+ num_mis_in_lpf_unit_height_log2);
} else {
- av1_filter_block_plane_horz_opt_chroma(cm, xd, &planes[plane], mi_row,
- mi_col, params_buf, tx_buf,
- plane, joint_filter_chroma);
+ av1_filter_block_plane_horz_opt_chroma(
+ cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
+ joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
}
} else {
av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
@@ -409,10 +327,11 @@ static int loop_filter_row_worker(void *arg1, void *arg2) {
AV1LfMTInfo *cur_job_info;
while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
const int lpf_opt_level = cur_job_info->lpf_opt_level;
- thread_loop_filter_rows(
+ av1_thread_loop_filter_rows(
lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
- lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf);
+ lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf,
+ MAX_MIB_SIZE_LOG2);
}
return 1;
}
@@ -423,24 +342,9 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int num_workers, AV1LfSync *lf_sync,
int lpf_opt_level) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- // Number of superblock rows and cols
- const int sb_rows =
- CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2);
int i;
-
- if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
- num_workers > lf_sync->num_workers) {
- av1_loop_filter_dealloc(lf_sync);
- av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
- }
-
- // Initialize cur_sb_col to -1 for all SB rows.
- for (i = 0; i < MAX_MB_PLANE; i++) {
- memset(lf_sync->cur_sb_col[i], -1,
- sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
- }
-
- enqueue_lf_jobs(lf_sync, start, stop, planes_to_lf, lpf_opt_level);
+ loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync,
+ lpf_opt_level, MAX_MIB_SIZE_LOG2);
// Set up loopfilter thread data.
for (i = num_workers - 1; i >= 0; --i) {
@@ -484,9 +388,9 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
}
for (dir = 0; dir < 2; ++dir) {
- thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane, dir,
- lpf_opt_level, /*lf_sync=*/NULL, params_buf,
- tx_buf);
+ av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane,
+ dir, lpf_opt_level, /*lf_sync=*/NULL,
+ params_buf, tx_buf, MAX_MIB_SIZE_LOG2);
}
}
}
@@ -500,15 +404,9 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int start_mi_row, end_mi_row, mi_rows_to_filter;
int planes_to_lf[3];
- // For each luma and chroma plane, whether to filter it or not.
- planes_to_lf[0] = (cm->lf.filter_level[0] || cm->lf.filter_level[1]) &&
- plane_start <= 0 && 0 < plane_end;
- planes_to_lf[1] = cm->lf.filter_level_u && plane_start <= 1 && 1 < plane_end;
- planes_to_lf[2] = cm->lf.filter_level_v && plane_start <= 2 && 2 < plane_end;
- // If the luma plane is purposely not filtered, neither are the chroma planes.
- if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return;
- // Early exit.
- if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return;
+ if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start,
+ plane_end))
+ return;
start_mi_row = 0;
mi_rows_to_filter = cm->mi_params.mi_rows;
@@ -722,7 +620,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
const int is_uv = plane > 0;
const int ss_y = is_uv && cm->seq_params->subsampling_y;
- AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+ PixelRect tile_rect = ctxt[plane].tile_rect;
const int unit_size = ctxt[plane].rsi->restoration_unit_size;
const int tile_h = tile_rect.bottom - tile_rect.top;
@@ -845,6 +743,12 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
cur_job_info->v_copy_end);
+
+ if (lrworkerdata->do_extend_border) {
+ aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane,
+ cur_job_info->v_copy_start,
+ cur_job_info->v_copy_end);
+ }
} else {
break;
}
@@ -854,7 +758,8 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
AVxWorker *workers, int nworkers,
- AV1LrSync *lr_sync, AV1_COMMON *cm) {
+ AV1LrSync *lr_sync, AV1_COMMON *cm,
+ int do_extend_border) {
FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
const int num_planes = av1_num_planes(cm);
@@ -865,7 +770,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
for (int plane = 0; plane < num_planes; plane++) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
- const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+ const PixelRect tile_rect = ctxt[plane].tile_rect;
const int max_tile_h = tile_rect.bottom - tile_rect.top;
const int unit_size = cm->rst_info[plane].restoration_unit_size;
@@ -897,6 +802,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
for (i = num_workers - 1; i >= 0; --i) {
AVxWorker *const worker = &workers[i];
lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
+ lr_sync->lrworkerdata[i].do_extend_border = do_extend_border;
worker->hook = loop_restoration_row_worker;
worker->data1 = lr_sync;
worker->data2 = &lr_sync->lrworkerdata[i];
@@ -918,7 +824,8 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
AV1_COMMON *cm, int optimized_lr,
AVxWorker *workers, int num_workers,
- AV1LrSync *lr_sync, void *lr_ctxt) {
+ AV1LrSync *lr_sync, void *lr_ctxt,
+ int do_extend_border) {
assert(!cm->features.all_lossless);
const int num_planes = av1_num_planes(cm);
@@ -929,7 +836,7 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
optimized_lr, num_planes);
foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
- cm);
+ cm, do_extend_border);
}
// Initializes cdef_sync parameters.
@@ -1002,13 +909,27 @@ static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
- const int nvfb =
- (cdef_worker->cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ AV1_COMMON *cm = cdef_worker->cm;
+ const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
int cur_fbr;
+ const int num_planes = av1_num_planes(cm);
while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
- av1_cdef_fb_row(cdef_worker->cm, cdef_worker->xd, cdef_worker->linebuf,
- cdef_worker->colbuf, cdef_worker->srcbuf, cur_fbr,
+ MACROBLOCKD *xd = cdef_worker->xd;
+ av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf,
+ cdef_worker->srcbuf, cur_fbr,
cdef_worker->cdef_init_fb_row_fn, cdef_sync);
+ if (cdef_worker->do_extend_border) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
+ const int is_uv = plane > 0;
+ const int mi_high = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+ const int unit_height = MI_SIZE_64X64 << mi_high;
+ const int v_start = cur_fbr * unit_height;
+ const int v_end =
+ AOMMIN(v_start + unit_height, ybf->crop_heights[is_uv]);
+ aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end);
+ }
+ }
}
return 1;
}
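A worked example of the per-row border extension added above, using the usual MI_SIZE_LOG2 == 2 and MI_SIZE_64X64 == 16:

    // Example (illustrative): 4:2:0 chroma plane, cur_fbr == 2.
    //   mi_high     = MI_SIZE_LOG2 - subsampling_y = 1
    //   unit_height = MI_SIZE_64X64 << 1 = 32 chroma rows per 64x64 filter row
    //   v_start = 64, v_end = AOMMIN(96, crop_height): only the rows completed
    //   by this job are border-extended.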
@@ -1017,7 +938,8 @@ static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
static void prepare_cdef_frame_workers(
AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker,
AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync,
- int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn) {
+ int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+ int do_extend_border) {
const int num_planes = av1_num_planes(cm);
cdef_worker[0].srcbuf = cm->cdef_info.srcbuf;
@@ -1028,6 +950,7 @@ static void prepare_cdef_frame_workers(
cdef_worker[i].cm = cm;
cdef_worker[i].xd = xd;
cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn;
+ cdef_worker[i].do_extend_border = do_extend_border;
for (int plane = 0; plane < num_planes; plane++)
cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane];
@@ -1111,8 +1034,8 @@ void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
AV1CdefWorkerData *const cdef_worker,
AVxWorker *const workers, AV1CdefSync *const cdef_sync,
- int num_workers,
- cdef_init_fb_row_t cdef_init_fb_row_fn) {
+ int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+ int do_extend_border) {
YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
const int num_planes = av1_num_planes(cm);
@@ -1122,7 +1045,7 @@ void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
reset_cdef_job_info(cdef_sync);
prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook,
workers, cdef_sync, num_workers,
- cdef_init_fb_row_fn);
+ cdef_init_fb_row_fn, do_extend_border);
launch_cdef_workers(workers, num_workers);
sync_cdef_workers(workers, cm, num_workers);
}
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index 7c284faa6..4cf23f2fc 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -70,6 +70,7 @@ typedef struct LoopRestorationWorkerData {
int32_t *rst_tmpbuf;
void *rlbs;
void *lr_ctxt;
+ int do_extend_border;
} LRWorkerData;
// Looprestoration row synchronization
@@ -106,6 +107,7 @@ typedef struct AV1CdefWorker {
uint16_t *srcbuf;
uint16_t *linebuf[MAX_MB_PLANE];
cdef_init_fb_row_t cdef_init_fb_row_fn;
+ int do_extend_border;
} AV1CdefWorkerData;
typedef struct AV1CdefRowSync {
@@ -135,7 +137,8 @@ typedef struct AV1CdefSyncData {
void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
AV1CdefWorkerData *const cdef_worker,
AVxWorker *const workers, AV1CdefSync *const cdef_sync,
- int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn);
+ int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+ int do_extend_border);
void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
const MACROBLOCKD *const xd,
CdefBlockInfo *const fb_info,
@@ -144,6 +147,14 @@ void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
int dstride, const uint8_t *src, int src_voffset,
int src_hoffset, int sstride, int vsize, int hsize);
+void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride,
+ const uint8_t *src, int src_voffset,
+ int src_hoffset, int sstride, int vsize,
+ int hsize);
+void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride,
+ const uint8_t *src, int src_voffset,
+ int src_hoffset, int sstride, int vsize,
+ int hsize);
void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
int num_workers);
void av1_free_cdef_sync(AV1CdefSync *cdef_sync);
@@ -163,13 +174,148 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
struct AV1Common *cm,
int optimized_lr, AVxWorker *workers,
int num_workers, AV1LrSync *lr_sync,
- void *lr_ctxt);
+ void *lr_ctxt, int do_extend_border);
void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
int num_workers, int num_rows_lr,
int num_planes, int width);
int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm);
+void av1_thread_loop_filter_rows(
+ const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+ struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
+ int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+ AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int mib_size_log2);
+
+static AOM_FORCE_INLINE bool skip_loop_filter_plane(const int planes_to_lf[3],
+ int plane,
+ int lpf_opt_level) {
+ // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
+ // chroma planes together
+ if (lpf_opt_level == 2) {
+ if (plane == AOM_PLANE_Y) {
+ return !planes_to_lf[plane];
+ }
+ if (plane == AOM_PLANE_U) {
+ // U and V are handled together
+ return !planes_to_lf[1] && !planes_to_lf[2];
+ }
+ assert(plane == AOM_PLANE_V);
+ if (plane == AOM_PLANE_V) {
+      // V is handled when U is filtered
+ return true;
+ }
+ }
+
+ // Normal operation mode
+ return !planes_to_lf[plane];
+}
+
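A short example of the lpf_opt_level == 2 behavior of the helper above:

    // Example (illustrative): lpf_opt_level == 2 with both chroma filter levels
    // nonzero. Jobs are enqueued for AOM_PLANE_Y and AOM_PLANE_U only;
    // skip_loop_filter_plane() returns true for AOM_PLANE_V because V is
    // filtered together with U in the joint chroma path.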
+static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
+ const int planes_to_lf[3],
+ int lpf_opt_level,
+ int num_mis_in_lpf_unit_height) {
+ int mi_row, plane, dir;
+ AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
+ lf_sync->jobs_enqueued = 0;
+ lf_sync->jobs_dequeued = 0;
+
+ // Launch all vertical jobs first, as they are blocking the horizontal ones.
+ // Launch top row jobs for all planes first, in case the output can be
+ // partially reconstructed row by row.
+ for (dir = 0; dir < 2; ++dir) {
+ for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) {
+ for (plane = 0; plane < 3; ++plane) {
+ if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
+ continue;
+ }
+ if (!planes_to_lf[plane]) continue;
+ lf_job_queue->mi_row = mi_row;
+ lf_job_queue->plane = plane;
+ lf_job_queue->dir = dir;
+ lf_job_queue->lpf_opt_level = lpf_opt_level;
+ lf_job_queue++;
+ lf_sync->jobs_enqueued++;
+ }
+ }
+ }
+}
+
+static AOM_INLINE void loop_filter_frame_mt_init(
+ AV1_COMMON *cm, int start_mi_row, int end_mi_row, const int planes_to_lf[3],
+ int num_workers, AV1LfSync *lf_sync, int lpf_opt_level,
+ int num_mis_in_lpf_unit_height_log2) {
+ // Number of superblock rows
+ const int sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_workers > lf_sync->num_workers) {
+ av1_loop_filter_dealloc(lf_sync);
+ av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+ }
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ for (int i = 0; i < MAX_MB_PLANE; i++) {
+ memset(lf_sync->cur_sb_col[i], -1,
+ sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
+ }
+
+ enqueue_lf_jobs(lf_sync, start_mi_row, end_mi_row, planes_to_lf,
+ lpf_opt_level, (1 << num_mis_in_lpf_unit_height_log2));
+}
+
+static AOM_INLINE AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+ AV1LfMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lf_sync->job_mutex);
+
+ if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
+ cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
+ lf_sync->jobs_dequeued++;
+ }
+
+ pthread_mutex_unlock(lf_sync->job_mutex);
+#else
+ (void)lf_sync;
+#endif
+
+ return cur_job_info;
+}
+
+static AOM_INLINE void loop_filter_data_reset(LFWorkerData *lf_data,
+ YV12_BUFFER_CONFIG *frame_buffer,
+ struct AV1Common *cm,
+ MACROBLOCKD *xd) {
+ struct macroblockd_plane *pd = xd->plane;
+ lf_data->frame_buffer = frame_buffer;
+ lf_data->cm = cm;
+ lf_data->xd = xd;
+ for (int i = 0; i < MAX_MB_PLANE; i++) {
+ memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst));
+ lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
+ lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
+ }
+}
+
+static AOM_INLINE int check_planes_to_loop_filter(const struct loopfilter *lf,
+ int *planes_to_lf,
+ int plane_start,
+ int plane_end) {
+ // For each luma and chroma plane, whether to filter it or not.
+ planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) &&
+ plane_start <= 0 && 0 < plane_end;
+ planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end;
+ planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end;
+ // If the luma plane is purposely not filtered, neither are the chroma
+ // planes.
+ if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0;
+ // Early exit.
+ if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return 0;
+ return 1;
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/tile_common.c b/av1/common/tile_common.c
index 6ecead818..45f4d0c04 100644
--- a/av1/common/tile_common.c
+++ b/av1/common/tile_common.c
@@ -9,6 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <stdbool.h>
+
#include "av1/common/av1_common_int.h"
#include "av1/common/resize.h"
#include "av1/common/tile_common.h"
@@ -37,7 +39,24 @@ void av1_get_tile_limits(AV1_COMMON *const cm) {
const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
- const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
+
+ bool use_level_7_above = false;
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_7_0 &&
+ seq_params->seq_level_idx[i] <= SEQ_LEVEL_8_3) {
+      // Currently, levels 7.x and 8.x are assumed to be used either for all
+      // operating points or for none of them.
+ if (i != 0 && !use_level_7_above) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Either all the operating points are levels 7.x or "
+ "8.x, or none of them are.");
+ }
+ use_level_7_above = true;
+ }
+ }
+ const int max_tile_area_sb =
+ (use_level_7_above ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA) >>
+ (2 * sb_size_log2);
tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols);
tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
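A worked example of the new limit (assuming 128x128 superblocks, i.e. sb_size_log2 == 7):

    // MAX_TILE_AREA                   >> 14 = (4096 * 2304) >> 14 =  576 superblocks
    // MAX_TILE_AREA_LEVEL_7_AND_ABOVE >> 14 = (4096 * 4608) >> 14 = 1152 superblocks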
@@ -154,9 +173,9 @@ int av1_get_sb_cols_in_tile(AV1_COMMON *cm, const TileInfo *tile) {
cm->seq_params->mib_size_log2);
}
-AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
- int is_uv) {
- AV1PixelRect r;
+PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
+ int is_uv) {
+ PixelRect r;
// Calculate position in the Y plane
r.left = tile_info->mi_col_start * MI_SIZE;
diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h
index 5e90d95e7..9a7200901 100644
--- a/av1/common/tile_common.h
+++ b/av1/common/tile_common.h
@@ -17,6 +17,7 @@ extern "C" {
#endif
#include "config/aom_config.h"
+#include "aom_dsp/rect.h"
struct AV1Common;
struct SequenceHeader;
@@ -42,19 +43,16 @@ void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
int av1_get_sb_rows_in_tile(struct AV1Common *cm, const TileInfo *tile);
int av1_get_sb_cols_in_tile(struct AV1Common *cm, const TileInfo *tile);
-typedef struct {
- int left, top, right, bottom;
-} AV1PixelRect;
-
// Return the pixel extents of the given tile
-AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
- const struct AV1Common *cm, int is_uv);
+PixelRect av1_get_tile_rect(const TileInfo *tile_info,
+ const struct AV1Common *cm, int is_uv);
// Define tile maximum width and area
// There is no maximum height since height is limited by area and width limits
// The minimum tile width or height is fixed at one superblock
#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels
#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels
+#define MAX_TILE_AREA_LEVEL_7_AND_ABOVE (4096 * 4608)
void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
void av1_get_tile_limits(struct AV1Common *const cm);
diff --git a/av1/common/txb_common.h b/av1/common/txb_common.h
index 5ba3951e8..40fcffca9 100644
--- a/av1/common/txb_common.h
+++ b/av1/common/txb_common.h
@@ -78,71 +78,6 @@ static INLINE int get_padded_idx(const int idx, const int bwl) {
return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2);
}
-static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
- int sig_mag) {
- const int ctx = base_level_count_to_index[count];
- int ctx_idx = -1;
-
- if (row == 0 && col == 0) {
- if (sig_mag >= 2) return 0;
-
- if (sig_mag == 1) {
- if (count >= 2)
- ctx_idx = 1;
- else
- ctx_idx = 2;
-
- return ctx_idx;
- }
-
- ctx_idx = 3 + ctx;
- assert(ctx_idx <= 6);
- return ctx_idx;
- } else if (row == 0) {
- if (sig_mag >= 2) return 6;
- if (sig_mag == 1) {
- if (count >= 2)
- ctx_idx = 7;
- else
- ctx_idx = 8;
- return ctx_idx;
- }
-
- ctx_idx = 9 + ctx;
- assert(ctx_idx <= 11);
- return ctx_idx;
- } else if (col == 0) {
- if (sig_mag >= 2) return 12;
- if (sig_mag == 1) {
- if (count >= 2)
- ctx_idx = 13;
- else
- ctx_idx = 14;
-
- return ctx_idx;
- }
-
- ctx_idx = 15 + ctx;
- assert(ctx_idx <= 17);
- // TODO(angiebird): turn this on once the optimization is finalized
- // assert(ctx_idx < 28);
- } else {
- if (sig_mag >= 2) return 18;
- if (sig_mag == 1) {
- if (count >= 2)
- ctx_idx = 19;
- else
- ctx_idx = 20;
- return ctx_idx;
- }
-
- ctx_idx = 21 + ctx;
-
- assert(ctx_idx <= 24);
- }
- return ctx_idx;
-}
-
static INLINE int get_br_ctx_2d(const uint8_t *const levels,
const int c, // raster order
const int bwl) {
@@ -351,11 +286,11 @@ static INLINE void set_dc_sign(int *cul_level, int dc_val) {
*cul_level += 2 << COEFF_CONTEXT_BITS;
}
-static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
- const TX_SIZE tx_size, const int plane,
- const ENTROPY_CONTEXT *const a,
- const ENTROPY_CONTEXT *const l,
- TXB_CTX *const txb_ctx) {
+static void get_txb_ctx_general(const BLOCK_SIZE plane_bsize,
+ const TX_SIZE tx_size, const int plane,
+ const ENTROPY_CONTEXT *const a,
+ const ENTROPY_CONTEXT *const l,
+ TXB_CTX *const txb_ctx) {
#define MAX_TX_SIZE_UNIT 16
static const int8_t signs[3] = { 0, -1, 1 };
static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {
@@ -437,7 +372,100 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
: 7;
txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
}
-#undef MAX_TX_SIZE_UNIT
}
+#define SPECIALIZE_GET_TXB_CTX(w, h) \
+ static void get_txb_ctx_##w##x##h( \
+ const BLOCK_SIZE plane_bsize, const int plane, \
+ const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, \
+ TXB_CTX *const txb_ctx) { \
+ static const int8_t signs[3] = { 0, -1, 1 }; \
+ static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, \
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 \
+ }; \
+ const TX_SIZE tx_size = TX_##w##X##h; \
+ const int txb_w_unit = tx_size_wide_unit[tx_size]; \
+ const int txb_h_unit = tx_size_high_unit[tx_size]; \
+ int dc_sign = 0; \
+ int k = 0; \
+ \
+ do { \
+ const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; \
+ assert(sign <= 2); \
+ dc_sign += signs[sign]; \
+ } while (++k < txb_w_unit); \
+ \
+ k = 0; \
+ do { \
+ const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; \
+ assert(sign <= 2); \
+ dc_sign += signs[sign]; \
+ } while (++k < txb_h_unit); \
+ \
+ txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; \
+ \
+ if (plane == 0) { \
+ if (plane_bsize == txsize_to_bsize[tx_size]) { \
+ txb_ctx->txb_skip_ctx = 0; \
+ } else { \
+ static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 3, 5, 5, 5, 6 } }; \
+ int top = 0; \
+ int left = 0; \
+ \
+ k = 0; \
+ do { \
+ top |= a[k]; \
+ } while (++k < txb_w_unit); \
+ top &= COEFF_CONTEXT_MASK; \
+ top = AOMMIN(top, 4); \
+ \
+ k = 0; \
+ do { \
+ left |= l[k]; \
+ } while (++k < txb_h_unit); \
+ left &= COEFF_CONTEXT_MASK; \
+ left = AOMMIN(left, 4); \
+ \
+ txb_ctx->txb_skip_ctx = skip_contexts[top][left]; \
+ } \
+ } else { \
+ const int ctx_base = get_entropy_context(tx_size, a, l); \
+ const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > \
+ num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \
+ ? 10 \
+ : 7; \
+ txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; \
+ } \
+ }
+
+SPECIALIZE_GET_TXB_CTX(4, 4)
+SPECIALIZE_GET_TXB_CTX(8, 8)
+SPECIALIZE_GET_TXB_CTX(16, 16)
+SPECIALIZE_GET_TXB_CTX(32, 32)
+
+// Wrapper for get_txb_ctx that calls the specialized get_txb_ctx_* versions
+// so that the compiler can compile away the while loops.
+static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
+ const TX_SIZE tx_size, const int plane,
+ const ENTROPY_CONTEXT *const a,
+ const ENTROPY_CONTEXT *const l,
+ TXB_CTX *const txb_ctx) {
+ switch (tx_size) {
+ case TX_4X4: get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx); break;
+ case TX_8X8: get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx); break;
+ case TX_16X16: get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx); break;
+ case TX_32X32: get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx); break;
+ default:
+ get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx);
+ break;
+ }
+}
+#undef MAX_TX_SIZE_UNIT
+
#endif // AOM_AV1_COMMON_TXB_COMMON_H_
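
The restructuring above is a specialize-and-dispatch pattern: SPECIALIZE_GET_TXB_CTX stamps out one copy of the routine per square transform size so every loop bound is a compile-time constant, and the inline get_txb_ctx() switch routes the common sizes to those copies while everything else falls back to get_txb_ctx_general(). A generic sketch of the same idea, with purely illustrative names, is:

    #include <stdint.h>
    #include <stdio.h>

    /* One instantiation per fixed length: the loop bound is a constant, so the
     * compiler can fully unroll it. */
    #define SPECIALIZE_SUM(n)                      \
      static int sum_##n(const uint8_t *v) {       \
        int acc = 0;                               \
        for (int i = 0; i < (n); ++i) acc += v[i]; \
        return acc;                                \
      }

    SPECIALIZE_SUM(4)
    SPECIALIZE_SUM(8)

    static int sum_general(const uint8_t *v, int n) {
      int acc = 0;
      for (int i = 0; i < n; ++i) acc += v[i];
      return acc;
    }

    /* Dispatcher: known sizes hit the unrolled copies, the rest stay general. */
    static int sum_dispatch(const uint8_t *v, int n) {
      switch (n) {
        case 4: return sum_4(v);
        case 8: return sum_8(v);
        default: return sum_general(v, n);
      }
    }

    int main(void) {
      const uint8_t v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      printf("%d %d\n", sum_dispatch(v, 4), sum_dispatch(v, 8)); /* 10 36 */
      return 0;
    }
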
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 14dc0fe47..d6fe325e0 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -25,7 +25,6 @@
#include "av1/common/mv.h"
#include "av1/common/convolve.h"
-#define MAX_PARAMDIM 9
#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 0bdf49f0f..67b28bc29 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -22,7 +22,7 @@
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params, unsigned round) {
+ const InterpFilterParams *filter_params, int round) {
const int bd = 8;
const int ntaps = 8;
@@ -168,11 +168,11 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
_mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
uint8_t *dst_x = dst + y * dst_stride + x;
- CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
__m128i result;
__m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
if (conv_params->is_compound) {
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
if (conv_params->do_average) {
const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
if (conv_params->use_dist_wtd_comp_avg) {
@@ -260,8 +260,8 @@ void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
// filters.
static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
int w, int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params,
- unsigned round, int bd) {
+ const InterpFilterParams *filter_params, int round,
+ int bd) {
const int ntaps = 8;
src -= ntaps / 2 - 1;
@@ -399,10 +399,10 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
_mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
uint16_t *dst_x = dst + y * dst_stride + x;
- CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
__m128i result;
if (conv_params->is_compound) {
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
if (conv_params->do_average) {
__m128i p_32 =
_mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
@@ -414,20 +414,20 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
} else {
shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
}
- __m128i res32 = _mm_sub_epi32(shifted, sub);
- res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const),
- round_bits_shift);
+ result = _mm_sub_epi32(shifted, sub);
+ result = _mm_sra_epi32(_mm_add_epi32(result, round_bits_const),
+ round_bits_shift);
- __m128i res16 = _mm_packus_epi32(res32, res32);
- res16 = _mm_min_epi16(res16, clip_pixel_);
- _mm_storel_epi64((__m128i *)dst_x, res16);
+ result = _mm_packus_epi32(result, result);
+ result = _mm_min_epi16(result, clip_pixel_);
+ _mm_storel_epi64((__m128i *)dst_x, result);
} else {
__m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
_mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
}
} else {
- const __m128i subbed = _mm_sub_epi32(shifted, sub);
- result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift);
+ result = _mm_sub_epi32(shifted, sub);
+ result = _mm_sra_epi16(_mm_add_epi32(result, bits_const), bits_shift);
result = _mm_packus_epi32(result, result);
result = _mm_min_epi16(result, clip_pixel_);
_mm_storel_epi64((__m128i *)dst_x, result);
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c
index f9bfb370a..738cc9848 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -2246,7 +2246,7 @@ static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
const int step = flipud ? -1 : 1;
const __m128i zero = _mm_setzero_si128();
for (int i = 0; i < height; ++i, j += step) {
- const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+ const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
__m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
u = _mm_packus_epi16(u, zero);
*((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
diff --git a/av1/common/x86/cdef_block_avx2.c b/av1/common/x86/cdef_block_avx2.c
index 3396a51cf..1ec4b6c33 100644
--- a/av1/common/x86/cdef_block_avx2.c
+++ b/av1/common/x86/cdef_block_avx2.c
@@ -255,38 +255,101 @@ void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
}
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
- const uint8_t *src, int sstride, int v,
- int h) {
- int i = 0, j = 0;
- int remaining_width = h;
-
- // Process multiple 16 pixels at a time.
- if (h > 15) {
- for (i = 0; i < v; i++) {
- for (j = 0; j < h - 15; j += 16) {
- __m128i row = _mm_loadu_si128((__m128i *)&src[i * sstride + j]);
- _mm256_storeu_si256((__m256i *)&dst[i * dstride + j],
- _mm256_cvtepu8_epi16(row));
- }
- }
- remaining_width = h & 0xe;
+ const uint8_t *src, int sstride,
+ int width, int height) {
+ int j = 0;
+ int remaining_width = width;
+ assert(height % 2 == 0);
+ assert(height > 0);
+ assert(width > 0);
+
+  // Process 32 pixels at a time.
+ if (remaining_width > 31) {
+ int i = 0;
+ do {
+ j = 0;
+ do {
+ __m128i row00 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
+ __m128i row01 = _mm_loadu_si128(
+ (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
+ __m128i row10 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
+ __m128i row11 = _mm_loadu_si128(
+ (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
+ _mm256_cvtepu8_epi16(row00));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
+ _mm256_cvtepu8_epi16(row01));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
+ _mm256_cvtepu8_epi16(row10));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
+ _mm256_cvtepu8_epi16(row11));
+ j += 32;
+ } while (j <= width - 32);
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 31;
+ }
+
+ // Process 16 pixels at a time.
+ if (remaining_width > 15) {
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
+ __m128i row1 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
+ _mm256_cvtepu8_epi16(row0));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
+ _mm256_cvtepu8_epi16(row1));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 15;
+ j += 16;
}
- // Process multiple 8 pixels at a time.
+ // Process 8 pixels at a time.
if (remaining_width > 7) {
- for (i = 0; i < v; i++) {
- __m128i row = _mm_loadl_epi64((__m128i *)&src[i * sstride + j]);
- _mm_storeu_si128((__m128i *)&dst[i * dstride + j],
- _mm_unpacklo_epi8(row, _mm_setzero_si128()));
- }
- remaining_width = h & 0x7;
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
+ __m128i row1 =
+ _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
+ _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
+ _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
+ _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
+ _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 7;
j += 8;
}
+ // Process 4 pixels at a time.
+ if (remaining_width > 3) {
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
+ __m128i row1 =
+ _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
+ _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
+ _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
+ _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
+ _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 3;
+ j += 4;
+ }
+
// Process the remaining pixels.
if (remaining_width) {
- for (i = 0; i < v; i++) {
- for (int k = j; k < h; k++) {
+ for (int i = 0; i < height; i++) {
+ for (int k = j; k < width; k++) {
dst[i * dstride + k] = src[i * sstride + k];
}
}
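
All of the cdef_copy_rect8_8bit_to_16bit_* variants in this and the following files compute the same thing: widen an 8-bit rectangle to 16 bits. The AVX2 rewrite only changes how many pixels each iteration widens (32, then 16, 8, 4, with a scalar tail) and handles two rows per step, which is why it now asserts an even, non-zero height. A scalar reference matching the tail loop above would look roughly like this:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar reference for the copy kernels: every SIMD variant must produce
     * exactly this result. Sketch only; parameter names follow the diff. */
    void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride,
                                    const uint8_t *src, int sstride,
                                    int width, int height) {
      for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
          dst[i * dstride + j] = src[i * sstride + j];
        }
      }
    }

    int main(void) {
      const uint8_t src[2 * 3] = { 1, 2, 3, 4, 5, 6 };
      uint16_t dst[2 * 3];
      copy_rect8_8bit_to_16bit_c(dst, 3, src, 3, 3, 2);
      printf("%u %u\n", dst[0], dst[5]); /* 1 6 */
      return 0;
    }
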
diff --git a/av1/common/x86/cdef_block_sse2.c b/av1/common/x86/cdef_block_sse2.c
index faf51fdd9..5ab7ffa2f 100644
--- a/av1/common/x86/cdef_block_sse2.c
+++ b/av1/common/x86/cdef_block_sse2.c
@@ -25,15 +25,15 @@ void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2,
}
void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride,
- const uint8_t *src, int sstride, int v,
- int h) {
+ const uint8_t *src, int sstride,
+ int width, int height) {
int j = 0;
- for (int i = 0; i < v; i++) {
- for (j = 0; j < (h & ~0x7); j += 8) {
+ for (int i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
v64 row = v64_load_unaligned(&src[i * sstride + j]);
v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
}
- for (; j < h; j++) {
+ for (; j < width; j++) {
dst[i * dstride + j] = src[i * sstride + j];
}
}
diff --git a/av1/common/x86/cdef_block_sse4.c b/av1/common/x86/cdef_block_sse4.c
index f87d15845..344c1e47c 100644
--- a/av1/common/x86/cdef_block_sse4.c
+++ b/av1/common/x86/cdef_block_sse4.c
@@ -26,14 +26,14 @@ void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2,
void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride,
const uint8_t *src, int sstride,
- int v, int h) {
+ int width, int height) {
int j = 0;
- for (int i = 0; i < v; i++) {
- for (j = 0; j < (h & ~0x7); j += 8) {
+ for (int i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
v64 row = v64_load_unaligned(&src[i * sstride + j]);
v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
}
- for (; j < h; j++) {
+ for (; j < width; j++) {
dst[i * dstride + j] = src[i * sstride + j];
}
}
diff --git a/av1/common/x86/cdef_block_ssse3.c b/av1/common/x86/cdef_block_ssse3.c
index a2faf79e3..0fb36eb6e 100644
--- a/av1/common/x86/cdef_block_ssse3.c
+++ b/av1/common/x86/cdef_block_ssse3.c
@@ -25,15 +25,15 @@ void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2,
}
void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride,
- const uint8_t *src, int sstride, int v,
- int h) {
+ const uint8_t *src, int sstride,
+ int width, int height) {
int j;
- for (int i = 0; i < v; i++) {
- for (j = 0; j < (h & ~0x7); j += 8) {
+ for (int i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
v64 row = v64_load_unaligned(&src[i * sstride + j]);
v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
}
- for (; j < h; j++) {
+ for (; j < width; j++) {
dst[i * dstride + j] = src[i * sstride + j];
}
}
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index 04112ff9b..1b39a0a8d 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -13,19 +13,21 @@
#include "config/av1_rtcd.h"
+#include "third_party/SVT-AV1/convolve_2d_avx2.h"
+
#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
-#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"
+
#include "av1/common/convolve.h"
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int subpel_y_qn,
- ConvolveParams *conv_params) {
+void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
if (filter_params_x->taps > 8) {
const int bd = 8;
int im_stride = 8, i;
@@ -92,29 +94,11 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
__m256i filt[4], coeffs_h[4], coeffs_v[4];
- filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-
prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
- const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
- int horiz_tap = SUBPEL_TAPS;
- int vert_tap = SUBPEL_TAPS;
-
- if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
- horiz_tap = 4;
- else if (!(filter_x[0] | filter_x[7]))
- horiz_tap = 6;
-
- if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
- vert_tap = 4;
- else if (!(filter_y[0] | filter_y[7]))
- vert_tap = 6;
+ int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
+ int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
if (horiz_tap == 6)
prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
@@ -131,8 +115,10 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const int fo_horiz = horiz_tap / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
for (int j = 0; j < w; j += 8) {
if (horiz_tap == 4) {
@@ -153,3 +139,23 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
}
+
+void av1_convolve_2d_sr_avx2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
+ const int32_t subpel_y_q4, ConvolveParams *conv_params) {
+ const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
+ const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
+
+ const bool use_general = (tap_x == 12 || tap_y == 12);
+ if (use_general) {
+ av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_q4, subpel_y_q4, conv_params);
+ } else {
+ av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_q4, subpel_y_q4, conv_params);
+ }
+}
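
The new wrapper leaves only the 12-tap case on the original (general) path; 4-, 6- and 8-tap filters go to av1_convolve_2d_sr_specialized_avx2 from the imported SVT-AV1 header. get_filter_tap() itself is not shown in this diff, but judging from the detection code it replaces (taps == 12 passes through, otherwise the outer kernel taps are inspected), it behaves roughly like the sketch below; the types and signature here are assumptions, not the real API.

    #include <stdint.h>

    /* Hypothetical stand-ins for libaom types, for illustration only. */
    typedef struct {
      int taps;
      const int16_t *kernel; /* assumed: subpel kernel for one phase */
    } FilterParamsSketch;

    /* Reconstructed from the removed detection logic: 12 taps pass through,
     * otherwise a nominally 8-tap kernel with zero outer taps is treated as
     * 6- or 4-tap. This mirrors the code deleted above, not the real
     * get_filter_tap(). */
    static int filter_tap_sketch(const FilterParamsSketch *fp) {
      const int16_t *f = fp->kernel;
      if (fp->taps == 12) return 12;
      if (!(f[0] | f[1] | f[6] | f[7])) return 4;
      if (!(f[0] | f[7])) return 6;
      return 8;
    }

    int main(void) {
      static const int16_t k6[8] = { 0, 2, 10, 20, 10, 2, 0, 0 };
      const FilterParamsSketch fp = { 8, k6 };
      return filter_tap_sketch(&fp) == 6 ? 0 : 1;
    }
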
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 89e0a4c8f..30de98232 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -13,16 +13,16 @@
#include "config/av1_rtcd.h"
+#include "third_party/SVT-AV1/convolve_avx2.h"
+
#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_y,
- const int subpel_y_qn) {
- int i, j, vert_tap = SUBPEL_TAPS;
+static AOM_INLINE void av1_convolve_y_sr_general_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
// right shift is F-1 because we are already dividing
// filter co-efficients by 2
const int right_shift_bits = (FILTER_BITS - 1);
@@ -32,16 +32,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
__m256i coeffs[6], s[12];
__m128i d[10];
- // Condition for checking valid vert_filt taps
- const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
- if (filter_params_y->taps == 12) {
- vert_tap = 12;
- } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
- vert_tap = 4;
- } else if (!(filter[0] | filter[7])) {
- vert_tap = 6;
- }
+ int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
if (vert_tap == 6)
prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
@@ -55,7 +46,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
if (vert_tap == 4) {
const int fo_vert = 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;
- for (j = 0; j < w; j += 16) {
+ for (int j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
@@ -150,7 +141,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const int fo_vert = vert_tap / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;
- for (j = 0; j < w; j += 16) {
+ for (int j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
__m256i src6;
@@ -255,7 +246,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
right_shift = _mm_cvtsi32_si128(FILTER_BITS);
right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
- for (j = 0; j < w; j += 8) {
+ for (int j = 0; j < w; j += 8) {
const uint8_t *data = &src_ptr[j];
__m256i src10;
@@ -403,7 +394,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const int fo_vert = filter_params_y->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;
- for (j = 0; j < w; j += 16) {
+ for (int j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
__m256i src6;
@@ -517,18 +508,33 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const int subpel_x_qn,
- ConvolveParams *conv_params) {
+void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t w,
+ int32_t h,
+ const InterpFilterParams *filter_params_y,
+ const int32_t subpel_y_q4) {
+ const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
+
+ if (vert_tap == 12) {
+ av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_q4);
+ } else {
+ av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_q4);
+ }
+}
+
+static AOM_INLINE void av1_convolve_x_sr_general_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
const int bits = FILTER_BITS - conv_params->round_0;
const __m128i round_shift = _mm_cvtsi32_si128(bits);
__m256i round_0_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
__m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
__m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
- int i, horiz_tap = SUBPEL_TAPS;
+ int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -539,16 +545,6 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- if (filter_params_x->taps == 12) {
- horiz_tap = 12;
- } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
- horiz_tap = 4;
- } else if (!(filter[0] | filter[7])) {
- horiz_tap = 6;
- }
-
if (horiz_tap == 6)
prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
else if (horiz_tap == 12) {
@@ -900,3 +896,21 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
}
+
+void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t w,
+ int32_t h,
+ const InterpFilterParams *filter_params_x,
+ const int32_t subpel_x_q4,
+ ConvolveParams *conv_params) {
+ const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
+
+ if (horz_tap == 12) {
+ av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_q4, conv_params);
+ } else {
+ av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_q4,
+ conv_params);
+ }
+}
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 0e7782250..012e75c1a 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -200,31 +200,31 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
int res_int;
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
do {
s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
res = convolve_lo_y(s + 0, coeffs);
res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
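
The uint32_t-to-int cast changes here (and in the jnt_convolve_sse2.c and reconinter_sse4.c hunks further down) keep the loaded type consistent with the int argument that _mm_cvtsi32_si128() takes. Where this patch wants a 32-bit load without dereferencing a cast pointer at all, it uses loadu_int32() (see the jnt_convolve_avx2.c hunk below); a sketch of that memcpy-based style of load, with an illustrative helper name, is:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* memcpy-based 32-bit load: no cast pointer is dereferenced, so there is
     * no alignment or aliasing assumption for the compiler to trip over.
     * Illustrative only; libaom's loadu_int32() helper serves this purpose. */
    static inline __m128i load_u8_4x1(const uint8_t *p) {
      int v;
      memcpy(&v, p, sizeof(v));
      return _mm_cvtsi32_si128(v);
    }

    int main(void) {
      const uint8_t px[4] = { 1, 2, 3, 4 };
      const __m128i r = load_u8_4x1(px);
      /* x86 is little-endian, so the four bytes pack to 0x04030201. */
      return _mm_cvtsi128_si32(r) == 0x04030201 ? 0 : 1;
    }
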
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c
index 12046e40c..de850ee59 100644
--- a/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -80,7 +80,7 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
for (i = 0; i < im_h; i += 2) {
const __m256i row0 =
_mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- __m256i row1 = _mm256_set1_epi16(0);
+ __m256i row1 = _mm256_setzero_si256();
if (i + 1 < im_h)
row1 =
_mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
@@ -181,9 +181,9 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
res_a_round = _mm256_max_epi16(res_a_round, zero);
- xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+ xx_storel_32(&dst[i * dst_stride + j],
_mm256_castsi256_si128(res_a_round));
- xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
_mm256_extracti128_si256(res_a_round, 1));
}
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 37f8f42b2..de3af3ada 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -517,7 +517,7 @@ static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
int bd, int out_shift) {
const int32_t *sinpi = sinpi_arr(bit);
- const __m128i zero = _mm_set1_epi32(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
rnding = _mm_unpacklo_epi32(rnding, zero);
const __m128i mul = _mm_set1_epi32(1 << 4);
@@ -698,7 +698,7 @@ static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
int bd, int out_shift) {
(void)bit;
__m128i v[4];
- __m128i zero = _mm_set1_epi32(0);
+ __m128i zero = _mm_setzero_si128();
__m128i fact = _mm_set1_epi32(NewSqrt2);
__m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
__m128i a0_low, a1_low;
@@ -3142,7 +3142,7 @@ static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
__m128i fact = _mm_set1_epi32(2 * NewSqrt2);
__m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
__m128i a0_low, a0_high, a1_low, a1_high;
- __m128i zero = _mm_set1_epi32(0);
+ __m128i zero = _mm_setzero_si128();
offset = _mm_unpacklo_epi32(offset, zero);
for (int i = 0; i < 16; i++) {
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c
index 9cedd449a..da52ecdc7 100644
--- a/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -282,7 +282,7 @@ void av1_highbd_dist_wtd_convolve_2d_avx2(
for (i = 0; i < im_h; i += 2) {
const __m256i row0 =
_mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- __m256i row1 = _mm256_set1_epi16(0);
+ __m256i row1 = _mm256_setzero_si256();
if (i + 1 < im_h)
row1 =
_mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c
index 87b1a66a4..7f6aceb88 100644
--- a/av1/common/x86/highbd_warp_affine_avx2.c
+++ b/av1/common/x86/highbd_warp_affine_avx2.c
@@ -158,7 +158,7 @@ void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
iy = iy * stride;
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
@@ -218,7 +218,7 @@ void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
_mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
@@ -333,7 +333,7 @@ void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
iy = iy * stride;
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
@@ -454,7 +454,7 @@ void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
_mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
diff --git a/av1/common/x86/intra_edge_sse4.c b/av1/common/x86/intra_edge_sse4.c
index fc69f41d7..f025f7917 100644
--- a/av1/common/x86/intra_edge_sse4.c
+++ b/av1/common/x86/intra_edge_sse4.c
@@ -33,7 +33,7 @@ void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
// Extend the first and last samples to simplify the loop for the 5-tap case
p[-1] = p[0];
- __m128i last = _mm_set1_epi8(p[sz - 1]);
+ __m128i last = _mm_set1_epi8((char)p[sz - 1]);
_mm_storeu_si128((__m128i *)&p[sz], last);
// Adjust input pointer for filter support area
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index 8ea856ecd..ae8f88e7d 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -9,15 +9,19 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <emmintrin.h>
#include <immintrin.h>
#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
#include "av1/common/convolve.h"
static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
@@ -788,16 +792,311 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
}
}
+#define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \
+ do { \
+ src_0 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+ src_1 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+ src_2 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+ src_3 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \
+ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \
+ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \
+ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \
+ \
+ src_0 = _mm256_add_epi16(src_0, offset_const); \
+ src_1 = _mm256_add_epi16(src_1, offset_const); \
+ src_2 = _mm256_add_epi16(src_2, offset_const); \
+ src_3 = _mm256_add_epi16(src_3, offset_const); \
+ \
+ _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
+ _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
+ _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
+ _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
+ } while (0)
+
+#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
+static AOM_INLINE void av1_dist_wtd_convolve_2d_no_avg_copy_avx2(
+ const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
+ int w, int h, const __m256i offset_const) {
+ int i = h;
+ if (w >= 16) {
+ __m256i src_0, src_1, src_2, src_3;
+ if (w == 128) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+ DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112);
+ src += 1 * src_stride;
+ dst += 1 * dst_stride;
+ i -= 1;
+ } while (i);
+ } else if (w == 64) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+ src += 1 * src_stride;
+ dst += 1 * dst_stride;
+ i -= 1;
+ } while (i);
+ } else if (w == 32) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ i -= 2;
+ } while (i);
+ } else if (w == 16) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ i -= 4;
+ } while (i);
+ }
+ } else {
+ const __m256i zero = _mm256_setzero_si256();
+ do {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));
+ const __m128i src_row_2 =
+ _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));
+ const __m128i src_row_3 =
+ _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));
+
+ __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+ __m256i src_32 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_2), src_row_3, 1);
+
+ src_10 = _mm256_unpacklo_epi8(src_10, zero);
+ src_32 = _mm256_unpacklo_epi8(src_32, zero);
+
+ src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);
+ src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);
+
+ src_10 = _mm256_add_epi16(src_10, offset_const);
+ src_32 = _mm256_add_epi16(src_32, offset_const);
+
+ // Accumulate values into the destination buffer
+ _mm_store_si128((__m128i *)(&dst[0 * dst_stride]),
+ _mm256_castsi256_si128(src_10));
+ _mm_store_si128((__m128i *)(&dst[1 * dst_stride]),
+ _mm256_extracti128_si256(src_10, 1));
+ _mm_store_si128((__m128i *)(&dst[2 * dst_stride]),
+ _mm256_castsi256_si128(src_32));
+ _mm_store_si128((__m128i *)(&dst[3 * dst_stride]),
+ _mm256_extracti128_si256(src_32, 1));
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ i -= 4;
+ } while (i);
+ }
+}
+
+#define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \
+ do { \
+ src_0 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+ src_1 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+ src_2 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+ src_3 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \
+ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \
+ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \
+ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \
+ src_0 = _mm256_add_epi16(src_0, offset_const); \
+ src_1 = _mm256_add_epi16(src_1, offset_const); \
+ src_2 = _mm256_add_epi16(src_2, offset_const); \
+ src_3 = _mm256_add_epi16(src_3, offset_const); \
+ \
+ ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \
+ ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \
+ ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \
+ ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \
+ \
+ res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \
+ res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \
+ res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \
+ res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ res_10 = _mm256_packus_epi16(res_0, res_1); \
+ res_32 = _mm256_packus_epi16(res_2, res_3); \
+ res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \
+ res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \
+ \
+ _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \
+ _mm256_castsi256_si128(res_10)); \
+ _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \
+ _mm256_extracti128_si256(res_10, 1)); \
+ _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \
+ _mm256_castsi256_si128(res_32)); \
+ _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \
+ _mm256_extracti128_si256(res_32, 1)); \
+ } while (0)
+
+#define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \
+ int i = h; \
+ if (w >= 16) { \
+ __m256i src_0, src_1, src_2, src_3; \
+ __m256i ref_0, ref_1, ref_2, ref_3; \
+ __m256i res_0, res_1, res_2, res_3; \
+ __m256i res_10, res_32; \
+ if (w == 128) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \
+ i -= 1; \
+ src += 1 * src_stride; \
+ dst += 1 * dst_stride; \
+ dst0 += 1 * dst_stride0; \
+ } while (i); \
+ } else if (w == 64) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \
+ \
+ i -= 1; \
+ src += 1 * src_stride; \
+ dst += 1 * dst_stride; \
+ dst0 += 1 * dst_stride0; \
+ } while (i); \
+ } else if (w == 32) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \
+ \
+ i -= 2; \
+ src += 2 * src_stride; \
+ dst += 2 * dst_stride; \
+ dst0 += 2 * dst_stride0; \
+ } while (i); \
+ } else { \
+ assert(w == 16); \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \
+ \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ } \
+ } else if (w == 8) { \
+ do { \
+ const __m128i src_0 = \
+ _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \
+ const __m128i src_1 = \
+ _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \
+ const __m128i src_2 = \
+ _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \
+ const __m128i src_3 = \
+ _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \
+ __m256i src_10 = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \
+ __m256i src_32 = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \
+ \
+ src_10 = _mm256_unpacklo_epi8(src_10, zero); \
+ src_32 = _mm256_unpacklo_epi8(src_32, zero); \
+ \
+ src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \
+ src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \
+ \
+ src_10 = _mm256_add_epi16(src_10, offset_const); \
+ src_32 = _mm256_add_epi16(src_32, offset_const); \
+ \
+ const __m256i ref_10 = \
+ load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \
+ const __m256i ref_32 = \
+ load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \
+ __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \
+ __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ __m256i res = _mm256_packus_epi16(res_10, res_32); \
+ const __m128i res_20 = _mm256_castsi256_si128(res); \
+ const __m128i res_31 = _mm256_extracti128_si256(res, 1); \
+ \
+ _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \
+ _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \
+ _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \
+ _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ } else { \
+ assert(w == 4); \
+ do { \
+ __m256i src_3210_8bit = \
+ _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \
+ loadu_int32(src + 1 * src_stride), 0, 0, \
+ loadu_int32(src + 2 * src_stride), \
+ loadu_int32(src + 3 * src_stride), 0, 0); \
+ \
+ __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \
+ src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \
+ src_3210 = _mm256_add_epi16(src_3210, offset_const); \
+ \
+ __m256i ref_3210 = \
+ _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \
+ *(int64_t *)(dst + 1 * dst_stride), \
+ *(int64_t *)(dst + 2 * dst_stride), \
+ *(int64_t *)(dst + 3 * dst_stride)); \
+ __m256i res_3210 = \
+ comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ res_3210 = _mm256_packus_epi16(res_3210, res_3210); \
+ const __m128i res_10 = _mm256_castsi256_si128(res_3210); \
+ const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \
+ \
+ *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \
+ *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \
+ *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \
+ *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ }
+
void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w,
int h, ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
+ assert(conv_params->round_0 == 3);
+ assert(conv_params->round_1 == 7);
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
- const int bits =
- FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const __m256i wt = unpack_weights_avx2(conv_params);
@@ -810,87 +1109,16 @@ void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
const int rounding_shift =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
- int i, j;
-
- if (!(w % 16)) {
- for (i = 0; i < h; i += 1) {
- for (j = 0; j < w; j += 16) {
- const __m256i src_16bit = _mm256_cvtepu8_epi16(
- _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
-
- const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
- const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
-
- if (do_average) {
- const __m256i data_ref_0 =
- _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
-
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
-
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
-
- _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
- _mm256_castsi256_si128(res_0));
- } else {
- _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
- res_unsigned);
- }
- }
- }
- } else if (!(w % 4)) {
- for (i = 0; i < h; i += 2) {
- for (j = 0; j < w; j += 8) {
- const __m128i src_row_0 =
- _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
- const __m128i src_row_1 =
- _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
- // since not all compilers yet support _mm256_set_m128i()
- const __m256i src_10 = _mm256_insertf128_si256(
- _mm256_castsi128_si256(src_row_0), src_row_1, 1);
-
- const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
- const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
-
- const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
-
- // Accumulate values into the destination buffer
- if (do_average) {
- const __m256i data_ref_0 = load_line2_avx2(
- &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
-
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
- if (w > 4) {
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
- } else {
- *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
- *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
- }
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
- }
- }
+ if (do_average) {
+ if (use_dist_wtd_comp_avg) {
+ DO_AVG_2D_COPY(1)
+ } else {
+ DO_AVG_2D_COPY(0)
}
+ } else {
+ av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride,
+ w, h, offset_const);
}
}
+#undef LEFT_SHIFT
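
The rewritten copy path bakes its shift into a constant instead of computing it per call: with the asserted rounding parameters (round_0 == 3, round_1 == 7) and FILTER_BITS, which is 7 in aom_dsp/aom_filter.h, the old runtime expression FILTER_BITS * 2 - round_1 - round_0 and the new LEFT_SHIFT macro both evaluate to 4. A small self-check of that equivalence, with the constants restated locally (so this is a sketch, not libaom code):

    #include <assert.h>

    enum { FILTER_BITS = 7 }; /* matches aom_dsp/aom_filter.h */
    #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)

    int main(void) {
      const int round_0 = 3, round_1 = 7; /* asserted by the copy function */
      const int bits = FILTER_BITS * 2 - round_1 - round_0; /* old runtime value */
      assert(bits == LEFT_SHIFT); /* both are 4 */
      return 0;
    }
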
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 581d1500a..ab937f92d 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -178,31 +178,31 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
if (w == 4) {
__m128i s[8], src6, res, res_shift;
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
do {
s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
res = convolve_lo_y(s + 0, coeffs);
res_shift = _mm_sll_epi32(res, left_shift);
diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c
index a503532e0..95814b480 100644
--- a/av1/common/x86/reconinter_sse4.c
+++ b/av1/common/x86/reconinter_sse4.c
@@ -33,13 +33,13 @@ void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
int i = 0;
if (4 == w) {
do {
- const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);
- const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
+ const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0);
+ const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0));
const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
- const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
- const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
+ const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1);
+ const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1));
const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
index 0d871de71..a7fdb5a9a 100644
--- a/av1/common/x86/resize_ssse3.c
+++ b/av1/common/x86/resize_ssse3.c
@@ -809,13 +809,44 @@ static void scale_plane_1_to_2_phase_0(const uint8_t *src,
} while (--y);
}
+// There are SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling
+// in SSSE3.
+static INLINE bool has_normative_scaler_ssse3(const int src_width,
+ const int src_height,
+ const int dst_width,
+ const int dst_height) {
+ const bool has_normative_scaler =
+ (2 * dst_width == src_width && 2 * dst_height == src_height) ||
+ (4 * dst_width == src_width && 4 * dst_height == src_height) ||
+ (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) ||
+ (dst_width == src_width * 2 && dst_height == src_height * 2);
+
+ return has_normative_scaler;
+}
+
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst,
const InterpFilter filter,
const int phase, const int num_planes) {
+ bool has_normative_scaler =
+ has_normative_scaler_ssse3(src->y_crop_width, src->y_crop_height,
+ dst->y_crop_width, dst->y_crop_height);
+
+ if (num_planes > 1) {
+ has_normative_scaler =
+ has_normative_scaler &&
+ has_normative_scaler_ssse3(src->uv_crop_width, src->uv_crop_height,
+ dst->uv_crop_width, dst->uv_crop_height);
+ }
+
+ if (!has_normative_scaler) {
+ av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+ return;
+ }
+
// We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
// the static analysis warnings.
- int scaled = 0;
+ int malloc_failed = 0;
for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
const int is_uv = i > 0;
const int src_w = src->crop_widths[is_uv];
@@ -828,7 +859,6 @@ void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
if (2 * dst_w == src_w && 2 * dst_h == src_h) {
// 2 to 1
- scaled = 1;
if (phase == 0) {
scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
dst->buffers[i], dst->strides[is_uv], dst_w,
@@ -845,22 +875,20 @@ void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
uint8_t *const temp_buffer =
(uint8_t *)malloc(buffer_stride * buffer_height);
- if (temp_buffer) {
- const InterpKernel *interp_kernel =
- (const InterpKernel *)av1_interp_filter_params_list[filter]
- .filter_ptr;
- scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv],
- dst_w, dst_h, interp_kernel[phase],
- temp_buffer);
- free(temp_buffer);
- } else {
- scaled = 0;
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
}
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
}
} else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
// 4 to 1
- scaled = 1;
if (phase == 0) {
scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
dst->buffers[i], dst->strides[is_uv], dst_w,
@@ -880,18 +908,17 @@ void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const int extra_padding = 16;
uint8_t *const temp_buffer =
(uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
- if (temp_buffer) {
- const InterpKernel *interp_kernel =
- (const InterpKernel *)av1_interp_filter_params_list[filter]
- .filter_ptr;
- scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv],
- dst_w, dst_h, interp_kernel[phase],
- temp_buffer);
- free(temp_buffer);
- } else {
- scaled = 0;
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
}
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
}
} else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
// 4 to 3
@@ -910,36 +937,36 @@ void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
: 0;
const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
- if (temp_buffer) {
- scaled = 1;
- const InterpKernel *interp_kernel =
- (const InterpKernel *)av1_interp_filter_params_list[filter]
- .filter_ptr;
- scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv], dst_w,
- dst_h, interp_kernel, phase, temp_buffer);
- free(temp_buffer);
- } else {
- scaled = 0;
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
}
- } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel, phase, temp_buffer);
+ free(temp_buffer);
+ } else {
+ assert(dst_w == src_w * 2 && dst_h == src_h * 2);
// 1 to 2
uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
- if (temp_buffer) {
- scaled = 1;
- const InterpKernel *interp_kernel =
- (const InterpKernel *)av1_interp_filter_params_list[filter]
- .filter_ptr;
- scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv], src_w,
- src_h, interp_kernel[8], temp_buffer);
- free(temp_buffer);
- } else {
- scaled = 0;
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
}
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], src_w,
+ src_h, interp_kernel[8], temp_buffer);
+ free(temp_buffer);
}
}
- if (!scaled) {
+
+ if (malloc_failed) {
av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
} else {
aom_extend_frame_borders(dst, num_planes);
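
Note on the hunk above: av1_resize_and_extend_frame_ssse3() now decides up front whether the requested ratio has a normative SIMD scaler (2:1, 4:1, 4:3, or 1:2 on both axes) and only falls back to the C path when that check fails or a temporary-buffer allocation fails. A minimal sketch of the ratio test, mirroring the helper shown at the top of the hunk (illustrative only):

    #include <stdbool.h>

    /* Sketch of the supported-ratio check; the real helper is
     * has_normative_scaler_ssse3() in this file. */
    static bool has_normative_scaler(int src_w, int src_h, int dst_w,
                                     int dst_h) {
      return (2 * dst_w == src_w && 2 * dst_h == src_h) ||        /* 2 to 1 */
             (4 * dst_w == src_w && 4 * dst_h == src_h) ||        /* 4 to 1 */
             (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) ||/* 4 to 3 */
             (dst_w == 2 * src_w && dst_h == 2 * src_h);          /* 1 to 2 */
    }
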
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index 3c5558dda..4ab35e808 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -230,7 +230,7 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
// Set up masks
- const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
__m256i mask[8];
for (int idx = 0; idx < 8; idx++) {
const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
@@ -367,7 +367,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
// Set up masks
- const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
__m256i mask[8];
for (int idx = 0; idx < 8; idx++) {
const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 72c7708f1..948bbfbf0 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -181,7 +181,7 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
// Set up masks
- const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
__m128i mask[4];
for (int idx = 0; idx < 4; idx++) {
const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
@@ -322,7 +322,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
// Set up masks
- const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
__m128i mask[4];
for (int idx = 0; idx < 4; idx++) {
const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
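
Note on the selfguided hunks: replacing the 0xffffffff literals with ~0 does not change the mask. ~0 is an int with every bit set, so the lanes are identical, but the call no longer passes an unsigned constant to _mm_set_epi32 (which takes int arguments) and so avoids an implicit-conversion warning. A small standalone check, assuming SSE2 is available:

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      /* Same bit pattern as using 0xffffffff, without the unsigned-to-int
       * conversion in the argument list. */
      const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
      unsigned out[4];
      _mm_storeu_si128((__m128i *)out, ones32);
      printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
      /* prints: ffffffff ffffffff 00000000 00000000 */
      return 0;
    }
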
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index b0c9a9385..ceb836ea6 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -1028,12 +1028,12 @@ int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
int64_t sum_error = 0;
int i, j;
__m256i row_error, col_error;
- __m256i zero = _mm256_set1_epi16(0);
+ __m256i zero = _mm256_setzero_si256();
__m256i dup_255 = _mm256_set1_epi16(255);
col_error = zero;
for (i = 0; i < (p_height / 4); i++) {
- row_error = _mm256_set1_epi16(0);
+ row_error = _mm256_setzero_si256();
for (j = 0; j < (p_width / 16); j++) {
__m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
(__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 6ff666518..f8fe578e9 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -21,7 +21,7 @@ int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
int64_t sum_error = 0;
int i, j;
__m128i row_error, col_error;
- __m128i zero = _mm_set1_epi16(0);
+ __m128i zero = _mm_setzero_si128();
__m128i dup_255 = _mm_set1_epi16(255);
col_error = zero;
for (i = 0; i < (p_height); i++) {
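
Note on the warp_plane hunks: _mm_setzero_si128() / _mm256_setzero_si256() produce the same all-zero register as _mm_set1_epi16(0) / _mm256_set1_epi16(0); the setzero form states the intent directly and typically compiles to a single register-clearing pxor/vpxor. A sketch of the equivalence, assuming SSE2:

    #include <emmintrin.h>

    /* Both return an all-zero 128-bit vector. */
    static __m128i zero_via_set1(void) { return _mm_set1_epi16(0); }
    static __m128i zero_via_setzero(void) { return _mm_setzero_si128(); }
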
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 8e7ffced1..53275eaaa 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -377,8 +377,8 @@ static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi,
if (bsize >= BLOCK_8X8 &&
(seq_params->subsampling_x || seq_params->subsampling_y)) {
const BLOCK_SIZE uv_subsize =
- ss_size_lookup[bsize][seq_params->subsampling_x]
- [seq_params->subsampling_y];
+ av1_ss_size_lookup[bsize][seq_params->subsampling_x]
+ [seq_params->subsampling_y];
if (uv_subsize == BLOCK_INVALID)
aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
"Invalid block size.");
@@ -543,13 +543,11 @@ static INLINE void extend_mc_border(const struct scale_factors *const sf,
}
}
-static void dec_calc_subpel_params(const MV *const src_mv,
- InterPredParams *const inter_pred_params,
- const MACROBLOCKD *const xd, int mi_x,
- int mi_y, uint8_t **pre,
- SubpelParams *subpel_params, int *src_stride,
- PadBlock *block, MV32 *scaled_mv,
- int *subpel_x_mv, int *subpel_y_mv) {
+static AOM_INLINE void dec_calc_subpel_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ const MACROBLOCKD *const xd, int mi_x, int mi_y, uint8_t **pre,
+ SubpelParams *subpel_params, int *src_stride, PadBlock *block,
+ MV32 *scaled_mv, int *subpel_x_mv, int *subpel_y_mv) {
const struct scale_factors *sf = inter_pred_params->scale_factors;
struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
const int bw = inter_pred_params->block_width;
@@ -562,8 +560,8 @@ static void dec_calc_subpel_params(const MV *const src_mv,
orig_pos_y += src_mv->row * (1 << (1 - ssy));
int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
orig_pos_x += src_mv->col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ int pos_y = av1_scaled_y(orig_pos_y, sf);
+ int pos_x = av1_scaled_x(orig_pos_x, sf);
pos_x += SCALE_EXTRA_OFF;
pos_y += SCALE_EXTRA_OFF;
@@ -631,7 +629,7 @@ static void dec_calc_subpel_params(const MV *const src_mv,
*src_stride = pre_buf->stride;
}
-static void dec_calc_subpel_params_and_extend(
+static AOM_INLINE void dec_calc_subpel_params_and_extend(
const MV *const src_mv, InterPredParams *const inter_pred_params,
MACROBLOCKD *const xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf,
uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
@@ -648,14 +646,17 @@ static void dec_calc_subpel_params_and_extend(
inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride);
}
+#define IS_DEC 1
+#include "av1/common/reconinter_template.inc"
+#undef IS_DEC
+
static void dec_build_inter_predictors(const AV1_COMMON *cm,
DecoderCodingBlock *dcb, int plane,
const MB_MODE_INFO *mi,
int build_for_obmc, int bw, int bh,
int mi_x, int mi_y) {
- av1_build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh,
- mi_x, mi_y, dcb->mc_buf,
- dec_calc_subpel_params_and_extend);
+ build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh, mi_x,
+ mi_y, dcb->mc_buf);
}
static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm,
@@ -3470,12 +3471,11 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
CHECK_MEM_ERROR(cm, pbi->tile_workers,
aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
CHECK_MEM_ERROR(cm, pbi->thread_data,
- aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+ aom_calloc(num_threads, sizeof(*pbi->thread_data)));
for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
AVxWorker *const worker = &pbi->tile_workers[worker_idx];
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- ++pbi->num_workers;
winterface->init(worker);
worker->thread_name = "aom tile worker";
@@ -3483,6 +3483,7 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
"Tile decoder thread creation failed");
}
+ ++pbi->num_workers;
if (worker_idx != 0) {
// Allocate thread data.
@@ -5280,6 +5281,9 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
+ // Frame border extension is not required in the decoder
+ // as it happens in extend_mc_border().
+ int do_extend_border_mt = 0;
if (!optimized_loop_restoration) {
if (do_loop_restoration)
av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf,
@@ -5289,7 +5293,8 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
if (pbi->num_workers > 1) {
av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker,
pbi->tile_workers, &pbi->cdef_sync,
- pbi->num_workers, av1_cdef_init_fb_row_mt);
+ pbi->num_workers, av1_cdef_init_fb_row_mt,
+ do_extend_border_mt);
} else {
av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd,
av1_cdef_init_fb_row);
@@ -5305,7 +5310,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
av1_loop_restoration_filter_frame_mt(
(YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
- &pbi->lr_ctxt);
+ &pbi->lr_ctxt, do_extend_border_mt);
} else {
av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
cm, optimized_loop_restoration,
@@ -5320,7 +5325,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
av1_loop_restoration_filter_frame_mt(
(YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
- &pbi->lr_ctxt);
+ &pbi->lr_ctxt, do_extend_border_mt);
} else {
av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
cm, optimized_loop_restoration,
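
Note on the decodeframe.c hunks: thread bookkeeping is tightened so pbi->num_workers is incremented only after a tile worker is fully created, and pbi->thread_data is zero-initialized with aom_calloc so teardown can safely NULL-check each entry. A minimal sketch of that create/teardown pattern, with purely illustrative names:

    #include <stdlib.h>

    typedef struct { void *td; } WorkerData;

    /* Count a worker only after it is fully set up; the zeroed array lets
     * teardown free exactly what was created. */
    static int create_workers(WorkerData **data, int *num_workers,
                              int requested) {
      *data = calloc(requested, sizeof(**data));  /* zeroed, like aom_calloc */
      if (!*data) return 0;
      *num_workers = 0;
      for (int i = 0; i < requested; ++i) {
        (*data)[i].td = malloc(64);  /* stand-in for per-thread state */
        if (!(*data)[i].td) break;   /* num_workers stays accurate */
        ++*num_workers;
      }
      return 1;
    }

    static void destroy_workers(WorkerData *data, int num_workers) {
      for (int i = 0; i < num_workers; ++i) {
        if (data[i].td != NULL) free(data[i].td);
      }
      free(data);
    }
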
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 6e7291d4c..5f114f9cb 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -279,7 +279,7 @@ int av1_neg_deinterleave(int diff, int ref, int max) {
static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
aom_reader *r, int skip) {
int cdf_num;
- const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num, 0);
+ const uint8_t pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num, 0);
if (skip) return pred;
FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -310,16 +310,6 @@ static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
return segment_id;
}
-static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis,
- int segment_id) {
- assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
-
- for (int y = 0; y < y_mis; y++)
- for (int x = 0; x < x_mis; x++)
- cm->cur_frame->seg_map[mi_offset + y * cm->mi_params.mi_cols + x] =
- segment_id;
-}
-
static int read_intra_segment_id(AV1_COMMON *const cm,
const MACROBLOCKD *const xd, int bsize,
aom_reader *r, int skip) {
@@ -330,13 +320,15 @@ static int read_intra_segment_id(AV1_COMMON *const cm,
const CommonModeInfoParams *const mi_params = &cm->mi_params;
const int mi_row = xd->mi_row;
const int mi_col = xd->mi_col;
- const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
+ const int mi_stride = cm->mi_params.mi_cols;
+ const int mi_offset = mi_row * mi_stride + mi_col;
const int bw = mi_size_wide[bsize];
const int bh = mi_size_high[bsize];
const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
const int segment_id = read_segment_id(cm, xd, r, skip);
- set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
+ segment_id);
return segment_id;
}
@@ -344,12 +336,20 @@ static void copy_segment_id(const CommonModeInfoParams *const mi_params,
const uint8_t *last_segment_ids,
uint8_t *current_segment_ids, int mi_offset,
int x_mis, int y_mis) {
- for (int y = 0; y < y_mis; y++)
- for (int x = 0; x < x_mis; x++)
- current_segment_ids[mi_offset + y * mi_params->mi_cols + x] =
- last_segment_ids
- ? last_segment_ids[mi_offset + y * mi_params->mi_cols + x]
- : 0;
+ const int stride = mi_params->mi_cols;
+ if (last_segment_ids) {
+ assert(last_segment_ids != current_segment_ids);
+ for (int y = 0; y < y_mis; y++) {
+ memcpy(&current_segment_ids[mi_offset + y * stride],
+ &last_segment_ids[mi_offset + y * stride],
+ sizeof(current_segment_ids[0]) * x_mis);
+ }
+ } else {
+ for (int y = 0; y < y_mis; y++) {
+ memset(&current_segment_ids[mi_offset + y * stride], 0,
+ sizeof(current_segment_ids[0]) * x_mis);
+ }
+ }
}
static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset,
@@ -382,7 +382,8 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
}
- int segment_id;
+ uint8_t segment_id;
+ const int mi_stride = cm->mi_params.mi_cols;
if (preskip) {
if (!seg->segid_preskip) return 0;
} else {
@@ -391,13 +392,14 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
mbmi->seg_id_predicted = 0;
}
segment_id = read_segment_id(cm, xd, r, 1);
- set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
+ segment_id);
return segment_id;
}
}
if (seg->temporal_update) {
- const int ctx = av1_get_pred_context_seg_id(xd);
+ const uint8_t ctx = av1_get_pred_context_seg_id(xd);
FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
struct segmentation_probs *const segp = &ec_ctx->seg;
aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx];
@@ -410,7 +412,8 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
} else {
segment_id = read_segment_id(cm, xd, r, 0);
}
- set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
+ segment_id);
return segment_id;
}
@@ -1204,7 +1207,9 @@ static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
.as_int;
break;
}
- default: { return 0; }
+ default: {
+ return 0;
+ }
}
int ret = is_mv_valid(&mv[0].as_mv);
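
Note on the decodemv.c hunks: the per-element double loops over the segment map are replaced with row-wise memcpy/memset, which works because segment ids are stored as contiguous uint8_t entries per mode-info unit. A sketch of the shared fill helper, assuming the signature used at the call sites above (the actual definition lives in a common header):

    #include <stdint.h>
    #include <string.h>

    /* Row-wise fill of a row-major uint8_t segment map with mi_stride
     * entries per row. */
    static void set_segment_id(uint8_t *seg_map, int mi_offset, int x_mis,
                               int y_mis, int mi_stride, uint8_t segment_id) {
      for (int y = 0; y < y_mis; ++y) {
        memset(&seg_map[mi_offset + y * mi_stride], segment_id, x_mis);
      }
    }
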
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index 2553ffb79..4559dc62e 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -58,8 +58,8 @@ static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width,
mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
- mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
- mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
+ mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
+ mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
mi_params->mi_alloc_bsize = BLOCK_4X4;
@@ -183,10 +183,12 @@ void av1_decoder_remove(AV1Decoder *pbi) {
aom_free(pbi->lf_worker.data1);
if (pbi->thread_data) {
- for (int worker_idx = 1; worker_idx < pbi->max_threads; worker_idx++) {
+ for (int worker_idx = 1; worker_idx < pbi->num_workers; worker_idx++) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- av1_free_mc_tmp_buf(thread_data->td);
- aom_free(thread_data->td);
+ if (thread_data->td != NULL) {
+ av1_free_mc_tmp_buf(thread_data->td);
+ aom_free(thread_data->td);
+ }
}
aom_free(pbi->thread_data);
}
@@ -225,6 +227,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
#endif
av1_free_mc_tmp_buf(&pbi->td);
aom_img_metadata_array_free(pbi->metadata);
+ av1_remove_common(&pbi->common);
aom_free(pbi);
}
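
Note on the decoder.c hunks: dec_set_mb_mi() now spells the rounding as ROUND_POWER_OF_TWO() instead of the hand-written (x + 2) >> 2, and av1_decoder_remove() only walks the workers that were actually created. The two rounding forms are the same computation; a sketch assuming the usual libaom macro definition (from aom_dsp/aom_dsp_common.h):

    /* Round value / 2^n to the nearest integer; for n == 2 this is
     * (value + 2) >> 2, matching the expression the patch replaces. */
    #define ROUND_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) >> 1)) >> (n))

    /* Example: mb_cols for 17 MI columns is ROUND_POWER_OF_TWO(17, 2)
     * == (17 + 2) >> 2 == 4. */
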
diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
index b0a9e24d8..cfc32702a 100644
--- a/av1/encoder/allintra_vis.c
+++ b/av1/encoder/allintra_vis.c
@@ -22,22 +22,17 @@
#include "av1/common/reconinter.h"
#include "av1/encoder/allintra_vis.h"
#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
#include "av1/encoder/hybrid_fwd_txfm.h"
#include "av1/encoder/model_rd.h"
#include "av1/encoder/rdopt_utils.h"
-// Process the wiener variance in 16x16 block basis.
-static int qsort_comp(const void *elem1, const void *elem2) {
- int a = *((const int *)elem1);
- int b = *((const int *)elem2);
- if (a > b) return 1;
- if (a < b) return -1;
- return 0;
-}
-
void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
+ // This block size is also used to determine the number of workers in
+ // multi-threading. If it is changed, "compute_num_ai_workers()" needs to be
+ // updated accordingly.
cpi->weber_bsize = BLOCK_8X8;
if (cpi->mb_weber_stats) return;
@@ -202,121 +197,202 @@ static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize,
return sb_wiener_var;
}
-static double calc_src_mean_var(const uint8_t *const src_buffer,
- const int buf_stride, const int block_size,
- const int use_hbd, double *mean) {
- double src_mean = 0.0;
- double src_variance = 0.0;
- for (int pix_row = 0; pix_row < block_size; ++pix_row) {
- for (int pix_col = 0; pix_col < block_size; ++pix_col) {
- int src_pix;
- if (use_hbd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_buffer);
- src_pix = src[pix_row * buf_stride + pix_col];
- } else {
- src_pix = src_buffer[pix_row * buf_stride + pix_col];
- }
- src_mean += src_pix;
- src_variance += src_pix * src_pix;
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate) {
+ AV1_COMMON *const cm = &cpi->common;
+ uint8_t *buffer = cpi->source->y_buffer;
+ int buf_stride = cpi->source->y_stride;
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int block_size = tx_size_wide[tx_size];
+ const int coeff_count = block_size * block_size;
+ const int mb_step = mi_size_wide[bsize];
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ const AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ // We allocate cpi->tile_data (of size 1) when we call this function in
+ // multithreaded mode, so cpi->tile_data may be a null pointer when we call
+ // this function in single-threaded mode.
+ AV1EncRowMultiThreadSync *const row_mt_sync =
+ cpi->tile_data ? &cpi->tile_data[0].row_mt_sync : NULL;
+ const int mi_cols = cm->mi_params.mi_cols;
+ const int mt_thread_id = mi_row / mb_step;
+ // TODO(chengchen): test different unit step size
+ const int mt_unit_step = mi_size_wide[BLOCK_64X64];
+ const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
+ int mt_unit_col = 0;
+
+ for (int mi_col = 0; mi_col < mi_cols; mi_col += mb_step) {
+ if (mi_col % mt_unit_step == 0) {
+ enc_row_mt->sync_read_ptr(row_mt_sync, mt_thread_id, mt_unit_col);
}
- }
- const int pix_num = block_size * block_size;
- src_variance -= (src_mean * src_mean) / pix_num;
- src_variance /= pix_num;
- *mean = src_mean / pix_num;
- return src_variance;
-}
-static BLOCK_SIZE pick_block_size(AV1_COMP *cpi,
- const BLOCK_SIZE orig_block_size) {
- const BLOCK_SIZE sub_block_size =
- get_partition_subsize(orig_block_size, PARTITION_SPLIT);
- const int mb_step = mi_size_wide[orig_block_size];
- const int sub_step = mb_step >> 1;
- const TX_SIZE tx_size = max_txsize_lookup[orig_block_size];
- const int block_size = tx_size_wide[tx_size];
- const int split_block_size = block_size >> 1;
- assert(split_block_size >= 8);
- const uint8_t *const buffer = cpi->source->y_buffer;
- const int buf_stride = cpi->source->y_stride;
- const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+ PREDICTION_MODE best_mode = DC_PRED;
+ int best_intra_cost = INT_MAX;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+ set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+ av1_num_planes(cm));
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col,
+ 0, av1_num_planes(cm));
+ int dst_buffer_stride = xd->plane[0].dst.stride;
+ uint8_t *dst_buffer = xd->plane[0].dst.buf;
+ uint8_t *mb_buffer =
+ buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
+ for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END;
+ ++mode) {
+ av1_predict_intra_block(xd, cm->seq_params->sb_size,
+ cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, mode, 0, 0,
+ FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
+ dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+ int intra_cost = aom_satd(coeff, coeff_count);
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = mode;
+ }
+ }
- double vote = 0.0;
- for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
- for (int mi_col = 0; mi_col < cpi->frame_info.mi_cols; mi_col += mb_step) {
- const uint8_t *mb_buffer =
- buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
- // (1). Calculate mean and var using the original block size
- double mean = 0.0;
- const double orig_var =
- calc_src_mean_var(mb_buffer, buf_stride, block_size, use_hbd, &mean);
- // (2). Calculate mean and var using the split block size
- double split_var[4] = { 0 };
- double split_mean[4] = { 0 };
- int sub_idx = 0;
- for (int row = mi_row; row < mi_row + mb_step; row += sub_step) {
- for (int col = mi_col; col < mi_col + mb_step; col += sub_step) {
- mb_buffer = buffer + row * MI_SIZE * buf_stride + col * MI_SIZE;
- split_var[sub_idx] =
- calc_src_mean_var(mb_buffer, buf_stride, split_block_size,
- use_hbd, &split_mean[sub_idx]);
- ++sub_idx;
+ av1_predict_intra_block(
+ xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES,
+ dst_buffer, dst_buffer_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ uint16_t eob;
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ QUANT_PARAM quant_param;
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ } else {
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ }
+#else
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order,
+ &quant_param);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
+ dst_buffer_stride, eob, 0);
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+ (mi_col / mb_step)];
+
+ weber_stats->rec_pix_max = 1;
+ weber_stats->rec_variance = 0;
+ weber_stats->src_pix_max = 1;
+ weber_stats->src_variance = 0;
+ weber_stats->distortion = 0;
+
+ int64_t src_mean = 0;
+ int64_t rec_mean = 0;
+ int64_t dist_mean = 0;
+
+ for (int pix_row = 0; pix_row < block_size; ++pix_row) {
+ for (int pix_col = 0; pix_col < block_size; ++pix_col) {
+ int src_pix, rec_pix;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer);
+ uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer);
+ src_pix = src[pix_row * buf_stride + pix_col];
+ rec_pix = rec[pix_row * dst_buffer_stride + pix_col];
+ } else {
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
}
+#else
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+#endif
+ src_mean += src_pix;
+ rec_mean += rec_pix;
+ dist_mean += src_pix - rec_pix;
+ weber_stats->src_variance += src_pix * src_pix;
+ weber_stats->rec_variance += rec_pix * rec_pix;
+ weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix);
+ weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix);
+ weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix);
}
- // (3). Determine whether to use the original or the split block size.
- // If use original, vote += 1.0.
- // If use split, vote -= 1.0.
- double max_split_mean = 0.0;
- double max_split_var = 0.0;
- double geo_split_var = 0.0;
- for (int i = 0; i < 4; ++i) {
- max_split_mean = AOMMAX(max_split_mean, split_mean[i]);
- max_split_var = AOMMAX(max_split_var, split_var[i]);
- geo_split_var += log(0.1 + split_var[i]);
- }
- geo_split_var = exp(geo_split_var / 4);
- const double param_1 = 1.5;
- const double param_2 = 1.0;
- // If the variance of the large block size is considerably larger than the
- // geometric mean of vars of small blocks;
- // Or if the variance of the large block size is larger than the local
- // variance;
- // Or if the variance of the large block size is considerably larger
- // than the mean.
- // It indicates that the source block is not a flat area, therefore we
- // might want to split into smaller block sizes to capture the
- // local characteristics.
- if (orig_var > param_1 * geo_split_var || orig_var > max_split_var ||
- sqrt(orig_var) > param_2 * mean) {
- vote -= 1.0;
- } else {
- vote += 1.0;
- }
+ }
+
+ if (cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) {
+ *sum_rec_distortion += weber_stats->distortion;
+ int est_block_rate = 0;
+ int64_t est_block_dist = 0;
+ model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion,
+ pix_num, &est_block_rate,
+ &est_block_dist);
+ *sum_est_rate += est_block_rate;
+ }
+
+ weber_stats->src_variance -= (src_mean * src_mean) / pix_num;
+ weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num;
+ weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
+ weber_stats->satd = best_intra_cost;
+
+ qcoeff[0] = 0;
+ int max_scale = 0;
+ for (int idx = 1; idx < coeff_count; ++idx) {
+ const int abs_qcoeff = abs(qcoeff[idx]);
+ max_scale = AOMMAX(max_scale, abs_qcoeff);
+ }
+ weber_stats->max_scale = max_scale;
+
+ if ((mi_col + mb_step) % mt_unit_step == 0 ||
+ (mi_col + mb_step) >= mi_cols) {
+ enc_row_mt->sync_write_ptr(row_mt_sync, mt_thread_id, mt_unit_col,
+ mt_unit_cols);
+ ++mt_unit_col;
}
}
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
+}
- return vote > 0.0 ? orig_block_size : sub_block_size;
+static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
+ double *sum_est_rate) {
+ MACROBLOCK *x = &cpi->td.mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
+ av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff,
+ dqcoeff, sum_rec_distortion, sum_est_rate);
+ }
}
-static int64_t pick_norm_factor_and_block_size(AV1_COMP *const cpi,
- BLOCK_SIZE *best_block_size) {
+static int64_t estimate_wiener_var_norm(AV1_COMP *const cpi,
+ const BLOCK_SIZE norm_block_size) {
const AV1_COMMON *const cm = &cpi->common;
- const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
- BLOCK_SIZE last_block_size;
- BLOCK_SIZE this_block_size = sb_size;
- *best_block_size = sb_size;
- // Pick from block size 128x128, 64x64, 32x32 and 16x16.
- do {
- last_block_size = this_block_size;
- assert(this_block_size >= BLOCK_16X16 && this_block_size <= BLOCK_128X128);
- const int block_size = block_size_wide[this_block_size];
- if (block_size < 32) break;
- this_block_size = pick_block_size(cpi, last_block_size);
- } while (this_block_size != last_block_size);
- *best_block_size = this_block_size;
-
int64_t norm_factor = 1;
- const BLOCK_SIZE norm_block_size = this_block_size;
assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128);
const int norm_step = mi_size_wide[norm_block_size];
double sb_wiener_log = 0;
@@ -366,16 +442,6 @@ static void automatic_intra_tools_off(AV1_COMP *cpi,
void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
- uint8_t *buffer = cpi->source->y_buffer;
- int buf_stride = cpi->source->y_stride;
- ThreadData *td = &cpi->td;
- MACROBLOCK *x = &td->mb;
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO mbmi;
- memset(&mbmi, 0, sizeof(mbmi));
- MB_MODE_INFO *mbmi_ptr = &mbmi;
- xd->mi = &mbmi_ptr;
-
const SequenceHeader *const seq_params = cm->seq_params;
if (aom_realloc_frame_buffer(
&cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
@@ -384,173 +450,44 @@ void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
NULL, cpi->oxcf.tool_cfg.enable_global_motion, 0))
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
+ cpi->norm_wiener_variance = 0;
+ MACROBLOCK *x = &cpi->td.mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ // xd->mi needs to be set up since it is used in av1_frame_init_quantizer.
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
av1_frame_init_quantizer(cpi);
- DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
- DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
- DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
- DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
-
- int mi_row, mi_col;
-
- BLOCK_SIZE bsize = cpi->weber_bsize;
- const TX_SIZE tx_size = max_txsize_lookup[bsize];
- const int block_size = tx_size_wide[tx_size];
- const int coeff_count = block_size * block_size;
-
- const BitDepthInfo bd_info = get_bit_depth_info(xd);
- cpi->norm_wiener_variance = 0;
- int mb_step = mi_size_wide[bsize];
-
double sum_rec_distortion = 0.0;
double sum_est_rate = 0.0;
- for (mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
- for (mi_col = 0; mi_col < cpi->frame_info.mi_cols; mi_col += mb_step) {
- PREDICTION_MODE best_mode = DC_PRED;
- int best_intra_cost = INT_MAX;
-
- xd->up_available = mi_row > 0;
- xd->left_available = mi_col > 0;
-
- const int mi_width = mi_size_wide[bsize];
- const int mi_height = mi_size_high[bsize];
- set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
- mi_row, mi_col);
- set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
- cm->mi_params.mi_rows, cm->mi_params.mi_cols);
- set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
- av1_num_planes(cm));
- xd->mi[0]->bsize = bsize;
- xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
-
- av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
- mi_col, 0, av1_num_planes(cm));
-
- int dst_buffer_stride = xd->plane[0].dst.stride;
- uint8_t *dst_buffer = xd->plane[0].dst.buf;
- uint8_t *mb_buffer =
- buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
-
- for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END;
- ++mode) {
- av1_predict_intra_block(
- xd, cm->seq_params->sb_size,
- cm->seq_params->enable_intra_edge_filter, block_size, block_size,
- tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
- dst_buffer_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
-
- av1_subtract_block(bd_info, block_size, block_size, src_diff,
- block_size, mb_buffer, buf_stride, dst_buffer,
- dst_buffer_stride);
- av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
- int intra_cost = aom_satd(coeff, coeff_count);
- if (intra_cost < best_intra_cost) {
- best_intra_cost = intra_cost;
- best_mode = mode;
- }
- }
-
- int idx;
- av1_predict_intra_block(xd, cm->seq_params->sb_size,
- cm->seq_params->enable_intra_edge_filter,
- block_size, block_size, tx_size, best_mode, 0, 0,
- FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
- dst_buffer, dst_buffer_stride, 0, 0, 0);
- av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
- mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
- av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
-
- const struct macroblock_plane *const p = &x->plane[0];
- uint16_t eob;
- const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
- QUANT_PARAM quant_param;
- int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
- av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
-#if CONFIG_AV1_HIGHBITDEPTH
- if (is_cur_buf_hbd(xd)) {
- av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
- scan_order, &quant_param);
- } else {
- av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
- scan_order, &quant_param);
- }
-#else
- av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
- scan_order, &quant_param);
-#endif // CONFIG_AV1_HIGHBITDEPTH
- av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
- dst_buffer_stride, eob, 0);
- WeberStats *weber_stats =
- &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols +
- (mi_col / mb_step)];
-
- weber_stats->rec_pix_max = 1;
- weber_stats->rec_variance = 0;
- weber_stats->src_pix_max = 1;
- weber_stats->src_variance = 0;
- weber_stats->distortion = 0;
-
- int64_t src_mean = 0;
- int64_t rec_mean = 0;
- int64_t dist_mean = 0;
-
- for (int pix_row = 0; pix_row < block_size; ++pix_row) {
- for (int pix_col = 0; pix_col < block_size; ++pix_col) {
- int src_pix, rec_pix;
-#if CONFIG_AV1_HIGHBITDEPTH
- if (is_cur_buf_hbd(xd)) {
- uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer);
- uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer);
- src_pix = src[pix_row * buf_stride + pix_col];
- rec_pix = rec[pix_row * dst_buffer_stride + pix_col];
- } else {
- src_pix = mb_buffer[pix_row * buf_stride + pix_col];
- rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
- }
-#else
- src_pix = mb_buffer[pix_row * buf_stride + pix_col];
- rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
-#endif
- src_mean += src_pix;
- rec_mean += rec_pix;
- dist_mean += src_pix - rec_pix;
- weber_stats->src_variance += src_pix * src_pix;
- weber_stats->rec_variance += rec_pix * rec_pix;
- weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix);
- weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix);
- weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix);
- }
- }
-
- sum_rec_distortion += weber_stats->distortion;
- int est_block_rate = 0;
- int64_t est_block_dist = 0;
- model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion,
- pix_num, &est_block_rate,
- &est_block_dist);
- sum_est_rate += est_block_rate;
-
- weber_stats->src_variance -= (src_mean * src_mean) / pix_num;
- weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num;
- weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
- weber_stats->satd = best_intra_cost;
-
- qcoeff[0] = 0;
- for (idx = 1; idx < coeff_count; ++idx) qcoeff[idx] = abs(qcoeff[idx]);
- qsort(qcoeff, coeff_count, sizeof(*coeff), qsort_comp);
- weber_stats->max_scale = (double)qcoeff[coeff_count - 1];
- }
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+ // Calculate differential contrast for each block for the entire image.
+ // TODO(aomedia:3376): Remove " && 0" when there are no data races in
+ // av1_calc_mb_wiener_var_mt(). See also bug aomedia:3380.
+ if (num_workers > 1 && 0) {
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+ av1_calc_mb_wiener_var_mt(cpi, num_workers, &sum_rec_distortion,
+ &sum_est_rate);
+ } else {
+ calc_mb_wiener_var(cpi, &sum_rec_distortion, &sum_est_rate);
}
// Determine whether to turn off several intra coding tools.
automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate);
- BLOCK_SIZE norm_block_size = BLOCK_16X16;
- cpi->norm_wiener_variance =
- pick_norm_factor_and_block_size(cpi, &norm_block_size);
+ const BLOCK_SIZE norm_block_size = cm->seq_params->sb_size;
+ cpi->norm_wiener_variance = estimate_wiener_var_norm(cpi, norm_block_size);
const int norm_step = mi_size_wide[norm_block_size];
double sb_wiener_log = 0;
@@ -558,18 +495,21 @@ void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
for (int its_cnt = 0; its_cnt < 2; ++its_cnt) {
sb_wiener_log = 0;
sb_count = 0;
- for (mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
- for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) {
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += norm_step) {
int sb_wiener_var =
get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
double min_max_scale = AOMMAX(
1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
- beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+
beta = AOMMIN(beta, 4);
beta = AOMMAX(beta, 0.25);
+ if (beta < 1 / min_max_scale) continue;
+
sb_wiener_var = (int)(cpi->norm_wiener_variance / beta);
int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
@@ -585,6 +525,8 @@ void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
}
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
aom_free_frame_buffer(&cm->cur_frame->buf);
}
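
Note on the allintra_vis.c hunks: the wiener-variance pass is restructured so av1_calc_mb_wiener_var_row() handles one macroblock row at a time and synchronizes through the enc_row_mt sync_read_ptr/sync_write_ptr hooks; in single-threaded mode those hooks are the dummy no-ops, and the multithreaded path stays disabled (" && 0") until the noted data races are fixed. The per-block max_scale is also now found with a single pass over the quantized coefficients instead of a qsort. A minimal sketch of that scan, assuming tran_low_t is a plain 32-bit integer type:

    #include <stdlib.h>

    /* Largest |qcoeff[idx]| over the AC coefficients, as computed in
     * av1_calc_mb_wiener_var_row(); replaces sorting the whole block. */
    static int max_ac_magnitude(const int *qcoeff, int coeff_count) {
      int max_scale = 0;
      for (int idx = 1; idx < coeff_count; ++idx) {
        const int abs_qcoeff = abs(qcoeff[idx]);
        if (abs_qcoeff > max_scale) max_scale = abs_qcoeff;
      }
      return max_scale;
    }
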
diff --git a/av1/encoder/allintra_vis.h b/av1/encoder/allintra_vis.h
index 6f60cdb6a..9e10566dd 100644
--- a/av1/encoder/allintra_vis.h
+++ b/av1/encoder/allintra_vis.h
@@ -22,6 +22,13 @@
void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate);
+
void av1_set_mb_wiener_variance(AV1_COMP *cpi);
int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c
index 37bc309ec..4cf6bd572 100644
--- a/av1/encoder/aq_complexity.c
+++ b/av1/encoder/aq_complexity.c
@@ -104,9 +104,8 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
if (segment == DEFAULT_AQ2_SEG) continue;
qindex_delta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->current_frame.frame_type, base_qindex,
- aq_c_q_adj_factor[aq_strength][segment], cpi->is_screen_content_type,
- cm->seq_params->bit_depth);
+ cpi, cm->current_frame.frame_type, base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment]);
// For AQ complexity mode, we don't allow Q0 in a segment if the base
// Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -137,48 +136,40 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col;
const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]);
const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]);
- int x, y;
int i;
unsigned char segment;
- if (0) {
- segment = DEFAULT_AQ2_SEG;
- } else {
- // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
- // It is converted to bits << AV1_PROB_COST_SHIFT units.
- const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
- << AV1_PROB_COST_SHIFT;
- const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
- const int target_rate = (int)(num / denom);
- double logvar;
- double low_var_thresh;
- const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
- cm->seq_params->bit_depth);
-
- low_var_thresh = (is_stat_consumption_stage_twopass(cpi))
- ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy),
- MIN_DEFAULT_LV_THRESH)
- : DEFAULT_LV_THRESH;
-
- av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
- logvar = av1_log_block_var(cpi, mb, bs);
-
- segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
- for (i = 0; i < AQ_C_SEGMENTS; ++i) {
- // Test rate against a threshold value and variance against a threshold.
- // Increasing segment number (higher variance and complexity) = higher Q.
- if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
- (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
- segment = i;
- break;
- }
+ // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+ // It is converted to bits << AV1_PROB_COST_SHIFT units.
+ const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
+ << AV1_PROB_COST_SHIFT;
+ const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
+ const int target_rate = (int)(num / denom);
+ double logvar;
+ double low_var_thresh;
+ const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+
+ low_var_thresh =
+ (is_stat_consumption_stage_twopass(cpi))
+ ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
+ logvar = av1_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
+ break;
}
}
// Fill in the entries in the segment map corresponding to this SB64.
- for (y = 0; y < ymis; y++) {
- for (x = 0; x < xmis; x++) {
- cpi->enc_seg.map[mi_offset + y * cm->mi_params.mi_cols + x] = segment;
- }
- }
+ const int mi_stride = cm->mi_params.mi_cols;
+ set_segment_id(cpi->enc_seg.map, mi_offset, xmis, ymis, mi_stride, segment);
}
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index 89c4ac33c..616d52f2e 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -26,6 +26,8 @@ CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
cr->counter_encode_maxq_scene_change = 0;
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
if (cr->map == NULL) {
av1_cyclic_refresh_free(cr);
return NULL;
@@ -73,10 +75,8 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
// Compute delta-q for the segment.
static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
- const RATE_CONTROL *const rc = &cpi->rc;
int deltaq = av1_compute_qdelta_by_rate(
- rc, cpi->common.current_frame.frame_type, q, rate_factor,
- cpi->is_screen_content_type, cpi->common.seq_params->bit_depth);
+ cpi, cpi->common.current_frame.frame_type, q, rate_factor);
if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
deltaq = -cr->max_qdelta_perc * q / 100;
}
@@ -86,9 +86,7 @@ static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
double correction_factor) {
const AV1_COMMON *const cm = &cpi->common;
- const FRAME_TYPE frame_type = cm->current_frame.frame_type;
const int base_qindex = cm->quant_params.base_qindex;
- const int bit_depth = cm->seq_params->bit_depth;
const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
const int mbs = cm->mi_params.MBs;
const int num4x4bl = mbs << 4;
@@ -105,17 +103,13 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
// Take segment weighted average for estimated bits.
const int estimated_bits =
(int)((1.0 - weight_segment1 - weight_segment2) *
- av1_estimate_bits_at_q(frame_type, base_qindex, mbs,
- correction_factor, bit_depth,
- cpi->is_screen_content_type) +
- weight_segment1 * av1_estimate_bits_at_q(
- frame_type, base_qindex + cr->qindex_delta[1],
- mbs, correction_factor, bit_depth,
- cpi->is_screen_content_type) +
- weight_segment2 * av1_estimate_bits_at_q(
- frame_type, base_qindex + cr->qindex_delta[2],
- mbs, correction_factor, bit_depth,
- cpi->is_screen_content_type));
+ av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) +
+ weight_segment1 *
+ av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1],
+ correction_factor) +
+ weight_segment2 *
+ av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2],
+ correction_factor));
return estimated_bits;
}
@@ -141,21 +135,21 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
}
// Compute delta-q corresponding to qindex i.
int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate;
// Take segment weighted average for bits per mb.
bits_per_mb =
(int)((1.0 - weight_segment) *
- av1_rc_bits_per_mb(cm->current_frame.frame_type, i,
- correction_factor, cm->seq_params->bit_depth,
- cpi->is_screen_content_type) +
- weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type,
- i + deltaq, correction_factor,
- cm->seq_params->bit_depth,
- cpi->is_screen_content_type));
+ av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i,
+ correction_factor, accurate_estimate) +
+ weight_segment * av1_rc_bits_per_mb(
+ cpi, cm->current_frame.frame_type, i + deltaq,
+ correction_factor, accurate_estimate));
return bits_per_mb;
}
void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
- int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run) {
int cdf_num;
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -174,21 +168,22 @@ void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
av1_get_spatial_seg_pred(cm, xd, &cdf_num, cr->skip_over4x4);
if (prev_segment_id != mbmi->segment_id) {
const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+ const int mi_stride = cm->mi_params.mi_cols;
+ const uint8_t segment_id = mbmi->segment_id;
for (int mi_y = 0; mi_y < ymis; mi_y++) {
- for (int mi_x = 0; mi_x < xmis; mi_x++) {
- const int map_offset =
- block_index + mi_y * cm->mi_params.mi_cols + mi_x;
- cr->map[map_offset] = 0;
- cpi->enc_seg.map[map_offset] = mbmi->segment_id;
- cm->cur_frame->seg_map[map_offset] = mbmi->segment_id;
- }
+ const int map_offset = block_index + mi_y * mi_stride;
+ memset(&cr->map[map_offset], 0, xmis);
+ memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+ memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
}
}
}
- if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
- x->actual_num_seg1_blocks -= xmis * ymis;
- else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2)
- x->actual_num_seg2_blocks -= xmis * ymis;
+ if (!dry_run) {
+ if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
+ x->actual_num_seg1_blocks -= xmis * ymis;
+ else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2)
+ x->actual_num_seg2_blocks -= xmis * ymis;
+ }
}
void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
@@ -219,12 +214,13 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
// Reset segment_id if will be skipped.
if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
}
+ const uint8_t segment_id = mbmi->segment_id;
// Update the cyclic refresh map, to be used for setting segmentation map
// for the next frame. If the block will be refreshed this frame, mark it
// as clean. The magnitude of the -ve influences how long before we consider
// it for refresh again.
- if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
new_map_value = -cr->time_for_refresh;
} else if (refresh_this_block) {
// Else if it is accepted as candidate for refresh, and has not already
@@ -238,30 +234,19 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
// Update entries in the cyclic refresh map with new_map_value, and
// copy mbmi->segment_id into global segmentation map.
- if (sh == 1) {
- for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
- const int map_offset = block_index + mi_y * cm->mi_params.mi_cols;
- memset(&cr->map[map_offset], new_map_value, xmis);
- memset(&cpi->enc_seg.map[map_offset], mbmi->segment_id, xmis);
- memset(&cm->cur_frame->seg_map[map_offset], mbmi->segment_id, xmis);
- }
- } else {
- for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
- for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
- const int map_offset =
- block_index + mi_y * cm->mi_params.mi_cols + mi_x;
- cr->map[map_offset] = new_map_value;
- cpi->enc_seg.map[map_offset] = mbmi->segment_id;
- cm->cur_frame->seg_map[map_offset] = mbmi->segment_id;
- }
- }
+ const int mi_stride = cm->mi_params.mi_cols;
+ for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+ const int map_offset = block_index + mi_y * mi_stride;
+ memset(&cr->map[map_offset], new_map_value, xmis);
+ memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+ memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
}
+
// Accumulate cyclic refresh update counters.
if (!dry_run) {
- if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1)
+ if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST1)
x->actual_num_seg1_blocks += xmis * ymis;
- else if (cyclic_refresh_segment_id(mbmi->segment_id) ==
- CR_SEGMENT_ID_BOOST2)
+ else if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST2)
x->actual_num_seg2_blocks += xmis * ymis;
}
}
@@ -314,15 +299,14 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
uint64_t sb_sad = 0;
uint64_t thresh_sad_low = 0;
uint64_t thresh_sad = INT64_MAX;
- memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols);
- sb_cols = (mi_params->mi_cols + cm->seq_params->mib_size - 1) /
- cm->seq_params->mib_size;
- sb_rows = (mi_params->mi_rows + cm->seq_params->mib_size - 1) /
- cm->seq_params->mib_size;
+ const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols;
+ const int mi_stride = mi_cols;
+ memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols);
+ sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+ sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
sbs_in_frame = sb_cols * sb_rows;
// Number of target blocks to get the q delta (segment 1).
- block_count =
- cr->percent_refresh * mi_params->mi_rows * mi_params->mi_cols / 100;
+ block_count = cr->percent_refresh * mi_rows * mi_cols / 100;
// Set the segmentation map: cycle through the superblocks, starting at
// cr->mb_index, and stopping when either block_count blocks have been found
// to be refreshed, or we have passed through whole frame.
@@ -337,16 +321,17 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
int sb_col_index = i - sb_row_index * sb_cols;
int mi_row = sb_row_index * cm->seq_params->mib_size;
int mi_col = sb_col_index * cm->seq_params->mib_size;
- assert(mi_row >= 0 && mi_row < mi_params->mi_rows);
- assert(mi_col >= 0 && mi_col < mi_params->mi_cols);
- bl_index = mi_row * mi_params->mi_cols + mi_col;
+ assert(mi_row >= 0 && mi_row < mi_rows);
+ assert(mi_col >= 0 && mi_col < mi_cols);
+ bl_index = mi_row * mi_stride + mi_col;
// Loop through all MI blocks in superblock and update map.
- xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params->mib_size);
- ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params->mib_size);
- if (cpi->sf.rt_sf.sad_based_comp_prune && cr->use_block_sad_scene_det &&
- cpi->rc.frames_since_key > 30 &&
+ xmis = AOMMIN(mi_cols - mi_col, cm->seq_params->mib_size);
+ ymis = AOMMIN(mi_rows - mi_row, cm->seq_params->mib_size);
+ if (cr->use_block_sad_scene_det && cpi->rc.frames_since_key > 30 &&
cr->counter_encode_maxq_scene_change > 30 &&
- cpi->src_sad_blk_64x64 != NULL) {
+ cpi->src_sad_blk_64x64 != NULL &&
+ cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
sb_sad = cpi->src_sad_blk_64x64[sb_col_index + sb_cols * sb_row_index];
int scale = (cm->width * cm->height < 640 * 360) ? 6 : 8;
int scale_low = 2;
@@ -356,7 +341,7 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
// cr_map only needed at 8x8 blocks.
for (y = 0; y < ymis; y += 2) {
for (x = 0; x < xmis; x += 2) {
- const int bl_index2 = bl_index + y * mi_params->mi_cols + x;
+ const int bl_index2 = bl_index + y * mi_stride + x;
// If the block is as a candidate for clean up then mark it
// for possible boost/refresh (segment 1). The segment id may get
// reset to 0 later if block gets coded anything other than low motion.
@@ -372,10 +357,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
// If segment is at least half of superblock, set to 1.
// Enforce that block sad (sb_sad) is not too high.
if (sum_map >= (xmis * ymis) >> 1 && sb_sad < thresh_sad) {
- for (y = 0; y < ymis; y++)
- for (x = 0; x < xmis; x++) {
- seg_map[bl_index + y * mi_params->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
- }
+ set_segment_id(seg_map, bl_index, xmis, ymis, mi_stride,
+ CR_SEGMENT_ID_BOOST1);
cr->target_num_seg_blocks += xmis * ymis;
}
i++;
@@ -390,6 +373,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
}
}
+static int is_scene_change_detected(AV1_COMP *const cpi) {
+ return cpi->rc.high_source_sad;
+}
+
// Set cyclic refresh parameters.
void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
// TODO(marpan): Parameters need to be tuned.
@@ -405,9 +392,15 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
qp_thresh = AOMMIN(35, rc->best_quality << 1);
int qp_max_thresh = 118 * MAXQ >> 7;
- const int scene_change_detected =
- cpi->rc.high_source_sad ||
- (cpi->ppi->use_svc && cpi->svc.high_source_sad_superframe);
+ const int scene_change_detected = is_scene_change_detected(cpi);
+
+ // Cases to reset the cyclic refresh adjustment parameters.
+ if (frame_is_intra_only(cm) || scene_change_detected) {
+ // Reset adaptive elements for intra only frames and scene changes.
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+ }
+
// Although this segment feature for RTC is only used for
// blocks >= 8X8, for more efficient coding of the seg map
// cur_frame->seg_map needs to set at 4x4 along with the
@@ -417,6 +410,8 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
// Also if loop-filter deltas is applied via segment, then
// we need to set cr->skip_over4x4 = 1.
cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
+
+ // Decide whether to enable cyclic refresh on this frame.
cr->apply_cyclic_refresh = 1;
if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
scene_change_detected || cpi->svc.temporal_layer_id > 0 ||
@@ -430,14 +425,13 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
cr->apply_cyclic_refresh = 0;
return;
}
- cr->percent_refresh = 10;
- // Increase the amount of refresh for #temporal_layers > 2, and for some
- // frames after scene change that is encoded at high Q.
+
+  // Increase the amount of refresh for #temporal_layers > 2.
if (cpi->svc.number_temporal_layers > 2)
cr->percent_refresh = 15;
- else if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- cr->counter_encode_maxq_scene_change < 20)
- cr->percent_refresh = 15;
+ else
+ cr->percent_refresh = 10 + cr->percent_refresh_adjustment;
+
cr->max_qdelta_perc = 60;
cr->time_for_refresh = 0;
cr->use_block_sad_scene_det =
@@ -454,9 +448,9 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
if (cr->percent_refresh > 0 &&
rc->frames_since_key <
(4 * cpi->svc.number_temporal_layers) * (100 / cr->percent_refresh)) {
- cr->rate_ratio_qdelta = 3.0;
+ cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment;
} else {
- cr->rate_ratio_qdelta = 2.0;
+ cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
}
// Adjust some parameters for low resolutions.
if (cm->width * cm->height <= 352 * 288) {
@@ -508,12 +502,16 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
struct segmentation *const seg = &cm->seg;
- const int scene_change_detected =
- cpi->rc.high_source_sad ||
- (cpi->ppi->use_svc && cpi->svc.high_source_sad_superframe);
+ const int scene_change_detected = is_scene_change_detected(cpi);
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
const int resolution_change =
cm->prev_frame && (cm->width != cm->prev_frame->width ||
cm->height != cm->prev_frame->height);
+
if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
if (!cr->apply_cyclic_refresh) {
// Set segmentation map to 0 and disable.
@@ -572,7 +570,11 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
const int qindex2 = clamp(
quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta,
0, MAXQ);
- cr->rdmult = av1_compute_rd_mult(cpi, qindex2);
+ cr->rdmult = av1_compute_rd_mult(
+ qindex2, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
@@ -601,6 +603,8 @@ void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
cpi->refresh_frame.golden_frame = true;
cr->apply_cyclic_refresh = 0;
cr->counter_encode_maxq_scene_change = 0;
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
}
int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h
index ecb5d6d14..3353c5a45 100644
--- a/av1/encoder/aq_cyclicrefresh.h
+++ b/av1/encoder/aq_cyclicrefresh.h
@@ -39,6 +39,12 @@ struct CYCLIC_REFRESH {
* for cyclic refresh.
*/
int percent_refresh;
+
+ /*!
+   * Active adjustment delta applied to percent_refresh, for rate control.
+ */
+ int percent_refresh_adjustment;
+
/*!
* Maximum q-delta as percentage of base q.
*/
@@ -94,6 +100,12 @@ struct CYCLIC_REFRESH {
* Rate target ratio to set q delta.
*/
double rate_ratio_qdelta;
+
+ /*!
+ * Active adjustment of qdelta rate ratio for enhanced rate control
+ */
+ double rate_ratio_qdelta_adjustment;
+
/*!
* Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
*/
@@ -171,14 +183,16 @@ int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
* \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE
* \param[in] bsize Block size
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
*
- * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
* the \c cm->cpi->enc_seg.map.
*/
void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
MACROBLOCK *const x, int mi_row, int mi_col,
- BLOCK_SIZE bsize);
+ BLOCK_SIZE bsize, RUN_TYPE dry_run);
/*!\brief Update segment_id for block based on mode selected.
*
@@ -201,7 +215,7 @@ void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
* \param[in] dry_run A code indicating whether it is part of the final
* pass for reconstructing the superblock
*
- * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
* the \c cm->cpi->enc_seg.map.
*/
void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
@@ -221,7 +235,7 @@ void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
*
* \param[in] x Pointer to MACROBLOCK structure
*
- * \return Update the \c x->cnt_zeromv, the \c x->actual_num_seg1_blocks and
+ * \remark Update the \c x->cnt_zeromv, the \c x->actual_num_seg1_blocks and
* the \c x->actual_num_seg1_blocks.
*/
void av1_init_cyclic_refresh_counters(MACROBLOCK *const x);
@@ -238,7 +252,7 @@ void av1_init_cyclic_refresh_counters(MACROBLOCK *const x);
* \param[in] cyclic_refresh Pointer to CYCLIC_REFRESH structure
* \param[in] x Pointer to MACROBLOCK structure
*
- * \return Update the \c cyclic_refresh->cnt_zeromv, the \c
+ * \remark Update the \c cyclic_refresh->cnt_zeromv, the \c
* cyclic_refresh->actual_num_seg1_blocks and the \c
* cyclic_refresh->actual_num_seg1_blocks.
*/
@@ -253,7 +267,7 @@ void av1_accumulate_cyclic_refresh_counters(
*
* \param[in] cpi Top level encoder structure
*
- * \return Returns the interval in \c cpi->rc.baseline_gf_interval.
+ * \remark Returns the interval in \c cpi->rc.baseline_gf_interval.
*/
void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
@@ -270,7 +284,7 @@ void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
*
* \param[in] cpi Top level encoder structure
*
- * \return Updates the \c cpi->cyclic_refresh with the settings.
+ * \remark Updates the \c cpi->cyclic_refresh with the settings.
*/
void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
@@ -284,7 +298,7 @@ void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
*
* \param[in] cpi Top level encoder structure
*
- * \return Updates the \c cpi->cyclic_refresh with the cyclic refresh
+ * \remark Updates the \c cpi->cyclic_refresh with the cyclic refresh
* parameters and the \c cm->seg with the segmentation data.
*/
void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
diff --git a/av1/encoder/aq_variance.c b/av1/encoder/aq_variance.c
index 3273ef8ed..d53d2c93a 100644
--- a/av1/encoder/aq_variance.c
+++ b/av1/encoder/aq_variance.c
@@ -74,10 +74,9 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
for (i = 0; i < MAX_SEGMENTS; ++i) {
// Set up avg segment id to be 1.0 and adjust the other segments around
// it.
- int qindex_delta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->current_frame.frame_type, base_qindex,
- rate_ratio[i] / avg_ratio, cpi->is_screen_content_type,
- cm->seq_params->bit_depth);
+ int qindex_delta =
+ av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+ base_qindex, rate_ratio[i] / avg_ratio);
// We don't allow qindex 0 in a segment if the base value is not 0.
// Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -212,10 +211,9 @@ int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
rate_level = block_var_level;
}
const int base_qindex = cm->quant_params.base_qindex;
- int qindex_delta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->current_frame.frame_type, base_qindex,
- deltaq_rate_ratio[rate_level], cpi->is_screen_content_type,
- cm->seq_params->bit_depth);
+ int qindex_delta =
+ av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_qindex,
+ deltaq_rate_ratio[rate_level]);
if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
qindex_delta = -base_qindex + 1;
diff --git a/av1/encoder/arm/crc32/hash_crc32.c b/av1/encoder/arm/crc32/hash_crc32.c
new file mode 100644
index 000000000..dd8685dcf
--- /dev/null
+++ b/av1/encoder/arm/crc32/hash_crc32.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <arm_acle.h>
+
+#define CRC_LOOP(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+#define CRC_SINGLE(op, crc, type, buf, len) \
+ if ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
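+
+// Together these helpers consume the buffer in descending chunk sizes: all
+// full 8-byte words first (CRC_LOOP with __crc32cd), then at most one 4-, 2-
+// and 1-byte tail chunk (CRC_SINGLE), so every input length is handled.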
+
+/* Return 32-bit CRC for the input buffer.
+ * Polynomial is 0x1EDC6F41.
+ */
+
+uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+#if !defined(__aarch64__)
+ // Align input to 8-byte boundary (only necessary for 32-bit builds.)
+ while (len && ((uintptr_t)buf & 7)) {
+ crc = __crc32cb(crc, *buf++);
+ len--;
+ }
+#endif
+
+ CRC_LOOP(__crc32cd, crc, uint64_t, buf, len)
+ CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len)
+ CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len)
+ CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len)
+
+ return ~crc;
+}
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index 153a2382b..8a282b34d 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -274,23 +274,32 @@ static INLINE void store_rect_16bit_to_32bit(const int16x8_t a,
vst1q_s32((b + 4), b_hi);
}
-static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *in,
const int stride,
int16x8_t *const out,
const int out_size) {
- for (int i = 0; i < out_size; ++i)
- out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
- (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
+ for (int i = 0; i < out_size; ++i) {
+ // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
+ // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
+ // the upper lane is unused or further modified after this call. The
+ // latency should be similar between the two.
+ out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
+ in += stride;
+ }
}
-static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *in,
const int stride,
int16x8_t *const out,
const int out_size) {
- for (int i = 0; i < out_size; ++i)
- out[out_size - i - 1] = vreinterpretq_s16_u64(
- vld1q_lane_u64((uint64_t *)(in + i * stride),
- vreinterpretq_u64_s16(out[out_size - i - 1]), 0));
+ for (int i = out_size - 1; i >= 0; --i) {
+ // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
+ // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
+ // the upper lane is unused or further modified after this call. The
+ // latency should be similar between the two.
+ out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
+ in += stride;
+ }
}
static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
@@ -2298,10 +2307,9 @@ void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
if (ud_flip) {
load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
} else {
@@ -2342,10 +2350,9 @@ void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
if (ud_flip) {
load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
} else {
@@ -2384,10 +2391,9 @@ void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
if (ud_flip) {
load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
} else {
@@ -2430,10 +2436,9 @@ void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
if (ud_flip)
load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
else
@@ -2471,10 +2476,9 @@ void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
if (ud_flip)
load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
else
@@ -2512,10 +2516,9 @@ void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
if (ud_flip) {
load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
} else {
@@ -2558,10 +2561,9 @@ void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
if (ud_flip) {
load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
} else {
@@ -2607,10 +2609,9 @@ void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
for (int i = 0; i < 2; i++) {
if (ud_flip) {
load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
@@ -2654,10 +2655,9 @@ void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
for (int i = 0; i < 2; i++) {
if (ud_flip) {
load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
@@ -2700,10 +2700,9 @@ void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
for (int i = 0; i < 2; i++) {
if (ud_flip) {
@@ -2753,10 +2752,9 @@ void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
if (col_txfm != NULL && row_txfm != NULL) {
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
for (int i = 0; i < 2; i++) {
if (ud_flip) {
@@ -2812,10 +2810,9 @@ void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
if (col_txfm != NULL && row_txfm != NULL) {
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
for (int i = 0; i < 4; i++) {
if (ud_flip) {
@@ -2872,10 +2869,9 @@ void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
if (col_txfm != NULL && row_txfm != NULL) {
- const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
- const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
- const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
- const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+ const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
+ const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
+ const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
diff --git a/av1/encoder/arm/neon/temporal_filter_neon.c b/av1/encoder/arm/neon/temporal_filter_neon.c
new file mode 100644
index 000000000..cae44f9a1
--- /dev/null
+++ b/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
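+// The two extra samples on each side leave room for the 5-sample horizontal
+// window at the left and right block edges; stores into this buffer use a
+// +2 column offset accordingly.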
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+// clang-format off
+
+DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+ 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
+ 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+// clang-format on
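+
+// Each 8-byte row of kSlidingWindowMask keeps a 5-sample window, shifted right
+// by one column per row, so four overlapping 5-sample windows can be extracted
+// from a single 8-sample load.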
+
+static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
+ const uint8_t *frame2, const uint32_t stride2,
+ const uint32_t block_width,
+ const uint32_t block_height,
+ uint8_t *frame_abs_diff,
+ const unsigned int dst_stride) {
+ uint8_t *dst = frame_abs_diff;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ vst1q_u8(dst + j + 2, abs_diff);
+ j += 16;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ i++;
+ } while (i < block_height);
+}
+
+static INLINE uint8x16_t load_and_pad(uint8_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint8x8_t s = vld1_u8(src);
+
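+  // At the left block edge (col == 0) and the right edge (col >=
+  // block_width - 4), the two out-of-range samples are padded by replicating
+  // the nearest valid sample.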
+ if (col == 0) {
+ s[0] = s[2];
+ s[1] = s[2];
+ } else if (col >= block_width - 4) {
+ s[6] = s[5];
+ s[7] = s[5];
+ }
+ return vcombine_u8(s, s);
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, uint8_t *frame_abs_diff,
+ uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, double *d_factor) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];
+ const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint8x16_t vsrc[5][2];
+ uint8_t *src = frame_abs_diff + col;
+
+ // Load, pad (for first and last two columns) and mask 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[i][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[i][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ uint32x4_t sum_01 = vdupq_n_u32(0);
+ uint32x4_t sum_23 = vdupq_n_u32(0);
+
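+      // Dotting each masked row of absolute differences with itself and
+      // accumulating over the five rows of the sliding window gives, after
+      // the pairwise add below, the 5x5 sums of squared differences for the
+      // four windows handled in this iteration.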
+ sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
+
+ sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
+
+ vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i][0] = vsrc[i + 1][0];
+ vsrc[i][1] = vsrc[i + 1][1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row into the bottom of the sliding window.
+ uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[4][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4][0] = vsrc[3][0];
+ vsrc[4][1] = vsrc[3][1];
+ }
+ }
+ }
+
+ // Perform filtering.
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+}
+
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
+// When using vld1q_u16_x4, compilers may insert an alignment hint of 256 bits.
+DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
+};
+
+static INLINE void get_squared_error(
+ const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2,
+ const uint32_t stride2, const uint32_t block_width,
+ const uint32_t block_height, uint16_t *frame_sse,
+ const unsigned int dst_stride) {
+ uint16_t *dst = frame_sse;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ uint16x8_t sse_lo =
+ vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
+ uint16x8_t sse_hi =
+ vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff));
+
+ vst1q_u16(dst + j + 2, sse_lo);
+ vst1q_u16(dst + j + 10, sse_hi);
+
+ j += 16;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ i++;
+ } while (i < block_height);
+}
+
+static INLINE uint16x8_t load_and_pad(uint16_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint16x8_t s = vld1q_u16(src);
+
+ if (col == 0) {
+ s[0] = s[2];
+ s[1] = s[2];
+ } else if (col >= block_width - 4) {
+ s[6] = s[5];
+ s[7] = s[5];
+ }
+ return s;
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, uint16_t *frame_sse,
+ uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, double *d_factor) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];
+ const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint16x8_t vsrc[5];
+ uint16_t *src = frame_sse + col;
+
+ // Load and pad (for first and last two columns) 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ for (int i = 0; i < 4; i++) {
+ uint32x4_t vsum = vdupq_n_u32(0);
+ for (int j = 0; j < 5; j++) {
+ vsum = vpadalq_u16(vsum, vandq_u16(vsrc[j], vmask.val[i]));
+ }
+ acc_5x5_neon[row][col + i] = horizontal_add_u32x4(vsum);
+ }
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row into the bottom of the sliding window.
+ vsrc[4] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4] = vsrc[3];
+ }
+ }
+ }
+
+ // Perform filtering.
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+}
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+void av1_apply_temporal_filter_neon(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+ uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] +=
+ (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
+ frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+ }
+ }
+ }
+ }
+ }
+
+ get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_abs_diff, SSE_STRIDE);
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_abs_diff, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor);
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_sse, SSE_STRIDE);
+
+ apply_temporal_filter(
+ pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
+ accum + plane_offset, count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor);
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/av1/encoder/arm/neon/wedge_utils_neon.c b/av1/encoder/arm/neon/wedge_utils_neon.c
new file mode 100644
index 000000000..54d8d1911
--- /dev/null
+++ b/av1/encoder/arm/neon/wedge_utils_neon.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c for details of the parameters and
+ * computation.
+ */
+uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ assert(N % 64 == 0);
+
+ uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int i = 0;
+ do {
+ int32x4_t sum[4];
+ int32x4_t sse[2];
+ int16x4_t sum_s16[4];
+
+ const int16x8_t r1_l = vld1q_s16(r1 + i);
+ const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
+ const int16x8_t d_l = vld1q_s16(d + i);
+ const int16x8_t d_h = vld1q_s16(d + i + 8);
+ // The following three lines are a bit inelegant compared to using a pair
+ // of vmovl_u8()... but it forces the compiler to emit a ZIP1, ZIP2 pair -
+ // which can be executed in parallel with the subsequent SSHL instructions.
+ // (SSHL can only be executed on half of the Neon pipes in modern Arm
+ // cores, whereas ZIP1/2 can be executed on all of them.)
+ const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0));
+ const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]);
+ const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]);
+
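+    // Per pixel this forms r1 * (1 << WEDGE_WEIGHT_BITS) + m * d, i.e. the
+    // mask-weighted residual, which is then saturated to 16 bits, squared and
+    // accumulated into a 64-bit total.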
+ sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);
+ sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
+ sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
+ sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
+
+ sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));
+ sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
+ sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
+ sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
+
+ sum_s16[0] = vqmovn_s32(sum[0]);
+ sum_s16[1] = vqmovn_s32(sum[1]);
+ sum_s16[2] = vqmovn_s32(sum[2]);
+ sum_s16[3] = vqmovn_s32(sum[3]);
+
+ sse[0] = vmull_s16(sum_s16[0], sum_s16[0]);
+ sse[1] = vmull_s16(sum_s16[2], sum_s16[2]);
+ sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]);
+ sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]);
+
+ v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0]));
+ v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1]));
+
+ i += 16;
+ } while (i < N);
+
+ uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index db61dbc33..97652cf87 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -745,14 +745,37 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
int segment_id, const int do_update) {
const AV1_COMMON *const cm = &cpi->common;
const CommonQuantParams *const quant_params = &cm->quant_params;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ int qindex_rd;
+
const int current_qindex = AOMMAX(
0,
AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
? quant_params->base_qindex + x->delta_qindex
: quant_params->base_qindex));
const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
- const int rdmult =
- av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
+
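+  // During a superblock QP sweep the rdmult is derived from
+  // rdmult_delta_qindex, which stays fixed across the sweep, rather than from
+  // the swept delta_qindex, so the rate-distortion lambda is not perturbed by
+  // the sweep.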
+ if (cpi->oxcf.sb_qp_sweep) {
+ const int current_rd_qindex =
+ AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+ ? quant_params->base_qindex +
+ x->rdmult_delta_qindex
+ : quant_params->base_qindex));
+ qindex_rd = av1_get_qindex(&cm->seg, segment_id, current_rd_qindex);
+ } else {
+ qindex_rd = qindex;
+ }
+
+ const int qindex_rdmult = qindex_rd + quant_params->y_dc_delta_q;
+ const int rdmult = av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
const int qindex_change = x->qindex != qindex;
if (qindex_change || do_update) {
av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x);
@@ -767,7 +790,7 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
av1_set_error_per_bit(&x->errorperbit, rdmult);
- av1_set_sad_per_bit(cpi, &x->sadperbit, qindex);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, qindex_rd);
x->prev_segment_id = segment_id;
}
diff --git a/av1/encoder/av1_temporal_denoiser.c b/av1/encoder/av1_temporal_denoiser.c
index 27a12cb03..87ae763ca 100644
--- a/av1/encoder/av1_temporal_denoiser.c
+++ b/av1/encoder/av1_temporal_denoiser.c
@@ -233,7 +233,7 @@ static AV1_DENOISER_DECISION perform_motion_compensation(
frame == ALTREF_FRAME ||
(frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
(frame != LAST_FRAME &&
- ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) ||
+ ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
denoiser->denoising_level >= kDenHigh))) {
frame = LAST_FRAME;
ctx->newmv_sse = ctx->zeromv_lastref_sse;
@@ -348,8 +348,9 @@ void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
decision = perform_motion_compensation(
&cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers,
- cpi->source->y_width, cpi->svc.ref_idx[0], cpi->svc.ref_idx[3],
- cpi->ppi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref);
+ cpi->source->y_width, cpi->ppi->rtc_ref.ref_idx[0],
+ cpi->ppi->rtc_ref.ref_idx[3], cpi->ppi->use_svc,
+ cpi->svc.spatial_layer_id, use_gf_temporal_ref);
if (decision == FILTER_BLOCK) {
decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start,
@@ -395,10 +396,11 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
}
void av1_denoiser_update_frame_info(
- AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
- FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
- int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
- int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) {
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer) {
const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
// Copy source into denoised reference buffers on KEY_FRAME or
// if the just encoded frame was resized. For SVC, copy source if the base
@@ -415,10 +417,10 @@ void av1_denoiser_update_frame_info(
return;
}
- if (svc->set_ref_frame_config) {
+ if (rtc_ref->set_ref_frame_config) {
int i;
for (i = 0; i < REF_FRAMES; i++) {
- if (svc->refresh[svc->spatial_layer_id] & (1 << i))
+ if (rtc_ref->refresh[svc->spatial_layer_id] & (1 << i))
copy_frame(&denoiser->running_avg_y[i + 1 + shift],
&denoiser->running_avg_y[INTRA_FRAME + shift]);
}
@@ -497,15 +499,16 @@ static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm,
}
int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
- struct SVC *svc, int svc_buf_shift,
- int refresh_alt, int refresh_gld, int refresh_lst,
- int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
+ struct RTC_REF *rtc_ref, struct SVC *svc,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx) {
int fail = 0;
- if (svc->set_ref_frame_config) {
+ if (rtc_ref->set_ref_frame_config) {
int i;
for (i = 0; i < REF_FRAMES; i++) {
if (cm->current_frame.frame_type == KEY_FRAME ||
- svc->refresh[svc->spatial_layer_id] & (1 << i)) {
+ rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) {
fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
i + 1 + svc_buf_shift);
}
@@ -671,7 +674,7 @@ void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) {
int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
CONTENT_STATE_SB content_state,
int temporal_layer_id) {
- if ((content_state.source_sad_nonrd == kLowSad &&
+ if ((content_state.source_sad_nonrd <= kLowSad &&
content_state.low_sumdiff) ||
(content_state.source_sad_nonrd == kHighSad &&
content_state.low_sumdiff) ||
@@ -691,10 +694,10 @@ int64_t av1_scale_acskip_thresh(int64_t threshold,
AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
int temporal_layer_id) {
if (noise_level >= kDenLow && abs_sumdiff < 5)
- return threshold *=
- (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6;
- else
- return threshold;
+ threshold *= (noise_level == kDenLow) ? 2
+ : (temporal_layer_id == 2) ? 10
+ : 6;
+ return threshold;
}
void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) {
@@ -710,6 +713,7 @@ void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) {
void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) {
AV1_COMMON *const cm = &cpi->common;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
SVC *const svc = &cpi->svc;
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
@@ -739,7 +743,8 @@ void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) {
svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
// Check if we need to allocate extra buffers in the denoiser
// for refreshed frames.
- if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift,
+ if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, rtc_ref,
+ svc, svc_buf_shift,
cpi->refresh_alt_ref_frame,
cpi->refresh_golden_frame,
cpi->refresh_last_frame, cpi->alt_fb_idx,
@@ -749,10 +754,10 @@ void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) {
#endif
}
av1_denoiser_update_frame_info(
- &cpi->denoiser, *cpi->source, svc, frame_type,
+ &cpi->denoiser, *cpi->source, rtc_ref, svc, frame_type,
cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1,
- svc->ref_idx[6], svc->ref_idx[3], svc->ref_idx[0], resize_pending,
- svc_refresh_denoiser_buffers, denoise_svc_second_layer);
+ rtc_ref->ref_idx[6], rtc_ref->ref_idx[3], rtc_ref->ref_idx[0],
+ resize_pending, svc_refresh_denoiser_buffers, denoise_svc_second_layer);
}
}
diff --git a/av1/encoder/av1_temporal_denoiser.h b/av1/encoder/av1_temporal_denoiser.h
index 71c8c1c0e..14dcccce6 100644
--- a/av1/encoder/av1_temporal_denoiser.h
+++ b/av1/encoder/av1_temporal_denoiser.h
@@ -69,12 +69,14 @@ typedef struct {
struct AV1_COMP;
struct SVC;
+struct RTC_REF;
void av1_denoiser_update_frame_info(
- AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
- FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
- int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
- int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer);
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer);
void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row,
int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
@@ -88,9 +90,10 @@ void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
PICK_MODE_CONTEXT *ctx);
int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
- struct SVC *svc, int svc_buf_shift,
- int refresh_alt, int refresh_gld, int refresh_lst,
- int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
+ struct RTC_REF *rtc, struct SVC *svc,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx);
int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
int use_svc, int noise_sen, int width, int height,
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 6e4af8188..4f853078c 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -219,7 +219,8 @@ static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
}
static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
- int segment_id, const MB_MODE_INFO *mi, aom_writer *w) {
+ uint8_t segment_id, const MB_MODE_INFO *mi,
+ aom_writer *w) {
if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
return 1;
} else {
@@ -232,7 +233,7 @@ static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
}
static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
- int segment_id, const MB_MODE_INFO *mi,
+ uint8_t segment_id, const MB_MODE_INFO *mi,
aom_writer *w) {
if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0;
if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
@@ -257,7 +258,7 @@ static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
}
static AOM_INLINE void write_is_inter(const AV1_COMMON *cm,
- const MACROBLOCKD *xd, int segment_id,
+ const MACROBLOCKD *xd, uint8_t segment_id,
aom_writer *w, const int is_inter) {
if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
@@ -414,18 +415,16 @@ static AOM_INLINE void pack_txb_tokens(
static INLINE void set_spatial_segment_id(
const CommonModeInfoParams *const mi_params, uint8_t *segment_ids,
- BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) {
+ BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) {
const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
const int bw = mi_size_wide[bsize];
const int bh = mi_size_high[bsize];
const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
- for (int y = 0; y < ymis; ++y) {
- for (int x = 0; x < xmis; ++x) {
- segment_ids[mi_offset + y * mi_params->mi_cols + x] = segment_id;
- }
- }
+ const int mi_stride = mi_params->mi_cols;
+
+ set_segment_id(segment_ids, mi_offset, xmis, ymis, mi_stride, segment_id);
}
int av1_neg_interleave(int x, int ref, int max) {
@@ -462,8 +461,8 @@ static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd,
AV1_COMMON *const cm = &cpi->common;
int cdf_num;
- const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num,
- cpi->cyclic_refresh->skip_over4x4);
+ const uint8_t pred = av1_get_spatial_seg_pred(
+ cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
const int mi_row = xd->mi_row;
const int mi_col = xd->mi_col;
@@ -498,7 +497,7 @@ static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm,
const MACROBLOCKD *xd, aom_writer *w) {
const MB_MODE_INFO *const mbmi = xd->mi[0];
const int is_compound = has_second_ref(mbmi);
- const int segment_id = mbmi->segment_id;
+ const uint8_t segment_id = mbmi->segment_id;
// If segment level coding of this signal is disabled...
// or the segment allows multiple reference frame options
@@ -1096,7 +1095,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td,
const MB_MODE_INFO *const mbmi = xd->mi[0];
const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame;
const PREDICTION_MODE mode = mbmi->mode;
- const int segment_id = mbmi->segment_id;
+ const uint8_t segment_id = mbmi->segment_id;
const BLOCK_SIZE bsize = mbmi->bsize;
const int allow_hp = cm->features.allow_high_precision_mv;
const int is_inter = is_inter_block(mbmi);
@@ -1532,7 +1531,7 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td,
const int is_inter_tx = is_inter_block(mbmi);
const int skip_txfm = mbmi->skip_txfm;
- const int segment_id = mbmi->segment_id;
+ const uint8_t segment_id = mbmi->segment_id;
if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
!(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) {
if (is_inter_tx) { // This implies skip flag is 0.
@@ -2681,6 +2680,12 @@ static AOM_INLINE void write_global_motion_params(
struct aom_write_bit_buffer *wb, int allow_hp) {
const TransformationType type = params->wmtype;
+ // As a workaround for an AV1 spec bug, we avoid choosing TRANSLATION
+ // type models. Check here that we don't accidentally pick one somehow.
+ // See comments in gm_get_motion_vector() for details on the bug we're
+  // working around here.
+ assert(type != TRANSLATION);
+
aom_wb_write_bit(wb, type != IDENTITY);
if (type != IDENTITY) {
aom_wb_write_bit(wb, type == ROTZOOM);
@@ -2764,7 +2769,31 @@ static AOM_INLINE void write_global_motion(AV1_COMP *cpi,
}
}
-static int check_frame_refs_short_signaling(AV1_COMMON *const cm) {
+static int check_frame_refs_short_signaling(AV1_COMMON *const cm,
+ bool enable_ref_short_signaling) {
+ // In rtc case when res < 360p and speed >= 9, we turn on
+ // frame_refs_short_signaling if it won't break the decoder.
+ if (enable_ref_short_signaling) {
+ const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ const int base =
+ 1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
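+    // base is the wrap-around period of order_hint (1 << order_hint_bits);
+    // dividing display_order_hint by it identifies which wrap-around group a
+    // frame belongs to.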
+ const int order_hint_group_cur =
+ cm->current_frame.display_order_hint / base;
+ const int order_hint_group_gld =
+ cm->ref_frame_map[gld_map_idx]->display_order_hint / base;
+ const int relative_dist = cm->current_frame.order_hint -
+ cm->ref_frame_map[gld_map_idx]->order_hint;
+
+    // If the current frame and the GOLDEN frame are in the same order_hint
+    // group and are not too far apart (relative distance of at most 64
+    // frames), then return 1.
+ if (order_hint_group_cur == order_hint_group_gld && relative_dist >= 0 &&
+ relative_dist <= 64) {
+ return 1;
+ }
+ return 0;
+ }
+
// Check whether all references are distinct frames.
const RefCntBuffer *seen_bufs[FRAME_BUFFERS] = { NULL };
int num_refs = 0;
@@ -2842,7 +2871,13 @@ static AOM_INLINE void write_uncompressed_header_obu(
CurrentFrame *const current_frame = &cm->current_frame;
FeatureFlags *const features = &cm->features;
- current_frame->frame_refs_short_signaling = 0;
+ if (!cpi->sf.rt_sf.enable_ref_short_signaling ||
+ !seq_params->order_hint_info.enable_order_hint ||
+ seq_params->order_hint_info.enable_ref_frame_mvs) {
+ current_frame->frame_refs_short_signaling = 0;
+ } else {
+ current_frame->frame_refs_short_signaling = 1;
+ }
if (seq_params->still_picture) {
assert(cm->show_existing_frame == 0);
@@ -3008,12 +3043,20 @@ static AOM_INLINE void write_uncompressed_header_obu(
#endif // FRAME_REFS_SHORT_SIGNALING
if (current_frame->frame_refs_short_signaling) {
- // NOTE(zoeliu@google.com):
- // An example solution for encoder-side implementation on frame refs
- // short signaling, which is only turned on when the encoder side
- // decision on ref frames is identical to that at the decoder side.
+ // In rtc case when cpi->sf.rt_sf.enable_ref_short_signaling is true,
+ // we turn on frame_refs_short_signaling when the current frame and
+ // golden frame are in the same order_hint group, and their relative
+ // distance is <= 64 (in order to be decodable).
+
+ // For other cases, an example solution for encoder-side
+ // implementation on frame_refs_short_signaling is also provided in
+ // this function, where frame_refs_short_signaling is only turned on
+ // when the encoder side decision on ref frames is identical to that
+ // at the decoder side.
+
current_frame->frame_refs_short_signaling =
- check_frame_refs_short_signaling(cm);
+ check_frame_refs_short_signaling(
+ cm, cpi->sf.rt_sf.enable_ref_short_signaling);
}
if (seq_params->order_hint_info.enable_order_hint)
@@ -3399,6 +3442,7 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
aom_wb_write_bit(
&wb, seq_params->op_params[i].display_model_param_present_flag);
if (seq_params->op_params[i].display_model_param_present_flag) {
+ assert(seq_params->op_params[i].initial_display_delay >= 1);
assert(seq_params->op_params[i].initial_display_delay <= 10);
aom_wb_write_literal(
&wb, seq_params->op_params[i].initial_display_delay - 1, 4);
@@ -3602,7 +3646,7 @@ static void write_large_scale_tile_obu(
}
}
- mem_put_le32(buf->data, tile_header);
+ mem_put_le32(buf->data, (MEM_VALUE_T)tile_header);
}
*total_size += tile_size;
@@ -4107,11 +4151,9 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
// write sequence header obu at each key frame or intra_only frame,
// preceded by 4-byte size
if (cm->current_frame.frame_type == INTRA_ONLY_FRAME ||
- (cm->current_frame.frame_type == KEY_FRAME &&
- cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET)) {
+ cm->current_frame.frame_type == KEY_FRAME) {
obu_header_size = av1_write_obu_header(
level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data);
-
obu_payload_size =
av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size);
const size_t length_field_size =
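
As a rough illustration of the new GOLDEN-frame test in check_frame_refs_short_signaling() above, here is a minimal standalone sketch (not part of the patch; the helper name and the example order-hint values are made up). With order_hint_bits_minus_1 = 6 the group base is 128, so short signaling is only allowed when the current frame and the GOLDEN frame share the same display-order-hint group and the current frame is at most 64 frames ahead:

#include <stdio.h>

/* Hypothetical helper mirroring the GOLDEN-frame check; not part of libaom. */
static int golden_ok_for_short_signaling(int order_hint_bits_minus_1,
                                         int cur_display_hint,
                                         int gld_display_hint,
                                         int relative_dist) {
  const int base = 1 << (order_hint_bits_minus_1 + 1);  /* 128 for 7-bit hints */
  const int same_group = (cur_display_hint / base) == (gld_display_hint / base);
  return same_group && relative_dist >= 0 && relative_dist <= 64;
}

int main(void) {
  /* Same group (both in [128, 255]) and only 20 frames apart: allowed. */
  printf("%d\n", golden_ok_for_short_signaling(6, 160, 140, 20));  /* 1 */
  /* Different groups (130 vs 100 straddle the 128 boundary): not allowed. */
  printf("%d\n", golden_ok_for_short_signaling(6, 130, 100, 30));  /* 0 */
  return 0;
}
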
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 0ad118db6..4185798ef 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -326,7 +326,7 @@ typedef struct {
//! The best color map found.
uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
//! A temporary buffer used for k-means clustering.
- int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+ int16_t kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
} PALETTE_BUFFER;
/*! \brief Contains buffers used by av1_compound_type_rd()
@@ -496,7 +496,7 @@ typedef struct {
*/
typedef struct {
//! Whether to skip transform and quantization on a partition block level.
- int skip_txfm;
+ uint8_t skip_txfm;
/*! \brief Whether to skip transform and quantization on a txfm block level.
*
@@ -801,13 +801,14 @@ typedef struct {
/*!\cond */
typedef enum {
kZeroSad = 0,
- kLowSad = 1,
- kMedSad = 2,
- kHighSad = 3
+ kVeryLowSad = 1,
+ kLowSad = 2,
+ kMedSad = 3,
+ kHighSad = 4
} SOURCE_SAD;
typedef struct {
- //! SAD levels in non-rd path for var-based part and inter-mode search
+ //! SAD levels in non-rd path
SOURCE_SAD source_sad_nonrd;
//! SAD levels in rd-path for var-based part qindex thresholds
SOURCE_SAD source_sad_rd;
@@ -942,6 +943,21 @@ typedef struct macroblock {
*/
int delta_qindex;
+ /*! \brief Difference between frame-level qindex and qindex used to
+ * compute rdmult (lambda).
+ *
+ * rdmult_delta_qindex is assigned the same value as delta_qindex before the
+ * qp sweep. During the qp sweep, delta_qindex is changed and used to
+ * calculate the actual quant params, while rdmult_delta_qindex remains the
+ * same and is used to calculate the rdmult in set_rdmult().
+ */
+ int rdmult_delta_qindex;
+
+ /*! \brief Current qindex (before being adjusted by delta_q_res) used to
+ * derive rdmult_delta_qindex.
+ */
+ int rdmult_cur_qindex;
+
/*! \brief Rate-distortion multiplier.
*
* The rd multiplier used to determine the rate-distortion trade-off. This is
@@ -1016,9 +1032,16 @@ typedef struct macroblock {
*/
int cnt_zeromv;
- /*!\brief Flag to force zeromv-skip block, for nonrd path.
+ /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path.
+ *
+ * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks
+ * in the superblock may be marked as zeromv-skip at block level.
+ */
+ int force_zeromv_skip_for_sb;
+
+ /*!\brief Flag to force zeromv-skip at block level, for nonrd path.
*/
- int force_zeromv_skip;
+ int force_zeromv_skip_for_blk;
/*! \brief Previous segment id for which qmatrices were updated.
* This is used to bypass setting of qmatrices if no change in qindex.
@@ -1207,6 +1230,9 @@ typedef struct macroblock {
PixelLevelGradientInfo *pixel_gradient_info;
/*! \brief Flags indicating the availability of cached gradient info. */
bool is_sb_gradient_cached[PLANE_TYPES];
+
+ /*! \brief Flag to reuse predicted samples of inter block. */
+ bool reuse_inter_pred;
/**@}*/
/*****************************************************************************
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 4f762b93e..39c505d43 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -465,7 +465,8 @@ static INLINE void compute_best_interintra_mode(
INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd,
INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) {
const AV1_COMMON *const cm = &cpi->common;
- int rate, skip_txfm_sb;
+ int rate;
+ uint8_t skip_txfm_sb;
int64_t dist, skip_sse_sb;
const int bw = block_size_wide[bsize];
mbmi->interintra_mode = interintra_mode;
@@ -688,7 +689,8 @@ static int handle_wedge_inter_intra_mode(
const int_mv mv0 = mbmi->mv[0];
// Refine motion vector for NEWMV case.
if (have_newmv_in_inter_mode(mbmi->mode)) {
- int rate_sum, skip_txfm_sb;
+ int rate_sum;
+ uint8_t skip_txfm_sb;
int64_t dist_sum, skip_sse_sb;
// get negative of mask
const uint8_t *mask =
@@ -1048,7 +1050,8 @@ static int64_t masked_compound_type_rd(
const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
// This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD);
- int rate_sum, tmp_skip_txfm_sb;
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
int64_t dist_sum, tmp_skip_sse_sb;
pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge,
pick_interinter_seg };
@@ -1300,7 +1303,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
// Case COMPOUND_AVERAGE and COMPOUND_DISTWTD
if (cur_type < COMPOUND_WEDGE) {
if (cpi->sf.inter_sf.enable_fast_compound_mode_search == 2) {
- int rate_sum, tmp_skip_txfm_sb;
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
int64_t dist_sum, tmp_skip_sse_sb;
// Reuse data if matching record is found
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 76d136c37..f4c1ba349 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -194,7 +194,8 @@ static int choose_primary_ref_frame(
// frame bit allocation.
if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
- if (cpi->ppi->use_svc) return av1_svc_primary_ref_frame(cpi);
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config)
+ return av1_svc_primary_ref_frame(cpi);
// Find the most recent reference frame with the same reference type as the
// current frame
@@ -310,8 +311,7 @@ int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
// Return the frame source, or NULL if we couldn't find one
static struct lookahead_entry *choose_frame_source(
AV1_COMP *const cpi, int *const flush, int *pop_lookahead,
- struct lookahead_entry **last_source,
- EncodeFrameParams *const frame_params) {
+ struct lookahead_entry **last_source, int *const show_frame) {
AV1_COMMON *const cm = &cpi->common;
const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
struct lookahead_entry *source = NULL;
@@ -353,7 +353,7 @@ static struct lookahead_entry *choose_frame_source(
src_index = 0;
}
- frame_params->show_frame = *pop_lookahead;
+ *show_frame = *pop_lookahead;
#if CONFIG_FPMT_TEST
if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) {
@@ -365,7 +365,7 @@ static struct lookahead_entry *choose_frame_source(
!is_stat_generation_stage(cpi))
src_index = gf_group->src_offset[cpi->gf_frame_index];
}
- if (frame_params->show_frame) {
+ if (*show_frame) {
// show frame, pop from buffer
// Get last frame source.
if (cm->current_frame.frame_number > 0) {
@@ -413,35 +413,35 @@ static void update_frame_flags(const AV1_COMMON *const cm,
const RefreshFrameInfo *const refresh_frame,
unsigned int *frame_flags) {
if (encode_show_existing_frame(cm)) {
- *frame_flags &= ~FRAMEFLAGS_GOLDEN;
- *frame_flags &= ~FRAMEFLAGS_BWDREF;
- *frame_flags &= ~FRAMEFLAGS_ALTREF;
- *frame_flags &= ~FRAMEFLAGS_KEY;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY;
return;
}
if (refresh_frame->golden_frame) {
*frame_flags |= FRAMEFLAGS_GOLDEN;
} else {
- *frame_flags &= ~FRAMEFLAGS_GOLDEN;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN;
}
if (refresh_frame->alt_ref_frame) {
*frame_flags |= FRAMEFLAGS_ALTREF;
} else {
- *frame_flags &= ~FRAMEFLAGS_ALTREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF;
}
if (refresh_frame->bwd_ref_frame) {
*frame_flags |= FRAMEFLAGS_BWDREF;
} else {
- *frame_flags &= ~FRAMEFLAGS_BWDREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF;
}
if (cm->current_frame.frame_type == KEY_FRAME) {
*frame_flags |= FRAMEFLAGS_KEY;
} else {
- *frame_flags &= ~FRAMEFLAGS_KEY;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY;
}
}
@@ -616,8 +616,8 @@ int av1_get_refresh_frame_flags(
// flags to 0 to keep things consistent.
if (frame_params->show_existing_frame) return 0;
- const SVC *const svc = &cpi->svc;
- if (is_frame_droppable(svc, ext_refresh_frame_flags)) return 0;
+ const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ if (is_frame_droppable(rtc_ref, ext_refresh_frame_flags)) return 0;
#if !CONFIG_REALTIME_ONLY
if (cpi->use_ducky_encode &&
@@ -630,10 +630,12 @@ int av1_get_refresh_frame_flags(
int refresh_mask = 0;
if (ext_refresh_frame_flags->update_pending) {
- if (svc->set_ref_frame_config) {
+ if (rtc_ref->set_ref_frame_config ||
+ use_rtc_reference_structure_one_layer(cpi)) {
for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
- int ref_frame_map_idx = svc->ref_idx[i];
- refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx;
+ int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ refresh_mask |= rtc_ref->refresh[ref_frame_map_idx]
+ << ref_frame_map_idx;
}
return refresh_mask;
}
@@ -792,6 +794,18 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
}
if (is_second_arf) {
+ // Allocate the memory for the tf_buf_second_arf buffer only when it is
+ // required.
+ int ret = aom_realloc_frame_buffer(
+ &cpi->ppi->tf_info.tf_buf_second_arf, oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->oxcf.tool_cfg.enable_global_motion, 0);
+ if (ret)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate tf_buf_second_arf");
+
YV12_BUFFER_CONFIG *tf_buf_second_arf =
&cpi->ppi->tf_info.tf_buf_second_arf;
// We didn't apply temporal filtering for second arf ahead in
@@ -813,9 +827,12 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
}
// Copy source metadata to the temporal filtered frame
- if (frame_input->source != source_buffer) {
- aom_copy_metadata_to_frame_buffer(frame_input->source,
- source_buffer->metadata);
+ if (source_buffer->metadata &&
+ aom_copy_metadata_to_frame_buffer(frame_input->source,
+ source_buffer->metadata)) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to copy source metadata to the temporal filtered frame");
}
}
#if CONFIG_COLLECT_COMPONENT_TIMING
@@ -827,8 +844,7 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
cm->show_frame = frame_params->show_frame;
cm->current_frame.frame_type = frame_params->frame_type;
// TODO(bohanli): Why is this? what part of it is necessary?
- av1_set_frame_size(cpi, cm->superres_upscaled_width,
- cm->superres_upscaled_height);
+ av1_set_frame_size(cpi, cm->width, cm->height);
if (set_mv_params) av1_set_mv_search_params(cpi);
#if CONFIG_RD_COMMAND
@@ -980,12 +996,21 @@ void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
#if !CONFIG_REALTIME_ONLY
if (cpi->use_ducky_encode &&
cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ int valid_rf_idx = 0;
for (int rf = LAST_FRAME; rf < REF_FRAMES; ++rf) {
if (cpi->ppi->gf_group.ref_frame_list[gf_index][rf] != INVALID_IDX) {
remapped_ref_idx[rf - LAST_FRAME] =
cpi->ppi->gf_group.ref_frame_list[gf_index][rf];
+ valid_rf_idx = remapped_ref_idx[rf - LAST_FRAME];
}
}
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (remapped_ref_idx[i] == INVALID_IDX)
+ remapped_ref_idx[i] = valid_rf_idx;
+ }
+
+ return;
}
#endif // !CONFIG_REALTIME_ONLY
@@ -1342,7 +1367,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
frame_params.show_frame = 1;
} else {
source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source,
- &frame_params);
+ &frame_params.show_frame);
}
if (source == NULL) { // If no source was found, we can't encode a frame.
@@ -1362,7 +1387,10 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
// Source may be changed if temporal filtered later.
frame_input.source = &source->img;
- frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+ if (cpi->ppi->use_svc && last_source != NULL)
+ av1_svc_set_last_source(cpi, &frame_input, &last_source->img);
+ else
+ frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
frame_input.ts_duration = source->ts_end - source->ts_start;
// Save unfiltered source. It is used in av1_get_second_pass_params().
cpi->unfiltered_source = frame_input.source;
@@ -1409,14 +1437,16 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
start_timing(cpi, av1_get_one_pass_rt_params_time);
#endif
#if CONFIG_REALTIME_ONLY
- av1_get_one_pass_rt_params(cpi, &frame_params, &frame_input, *frame_flags);
- if (use_one_pass_rt_reference_structure(cpi))
- av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0);
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
#else
if (use_one_pass_rt_params) {
- av1_get_one_pass_rt_params(cpi, &frame_params, &frame_input, *frame_flags);
- if (use_one_pass_rt_reference_structure(cpi))
- av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0);
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
}
#endif
#if CONFIG_COLLECT_COMPONENT_TIMING
@@ -1437,11 +1467,13 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
// this parameter should be used with caution.
frame_params.speed = oxcf->speed;
- // Work out some encoding parameters specific to the pass:
- if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
- av1_cyclic_refresh_update_parameters(cpi);
- } else if (is_stat_generation_stage(cpi)) {
- cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg);
+#if !CONFIG_REALTIME_ONLY
+ // Set forced key frames when necessary. For two-pass encoding / lap mode,
+ // this is already handled by av1_get_second_pass_params. However, when no
+ // stats are available, we still need to check if the new frame is a keyframe.
+ // For one-pass RT, this is already checked in av1_get_one_pass_rt_params.
+ if (!use_one_pass_rt_params &&
+ (is_stat_generation_stage(cpi) || has_no_stats_stage(cpi))) {
// Current frame is coded as a key-frame for any of the following cases:
// 1) First frame of a video
// 2) For all-intra frame encoding
@@ -1452,9 +1484,18 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
frame_update_type != INTNL_OVERLAY_UPDATE) {
frame_params.frame_type = KEY_FRAME;
- } else {
+ } else if (is_stat_generation_stage(cpi)) {
+ // For stats generation, set the frame type to inter here.
frame_params.frame_type = INTER_FRAME;
}
+ }
+#endif
+
+ // Work out some encoding parameters specific to the pass:
+ if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_parameters(cpi);
+ } else if (is_stat_generation_stage(cpi)) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg);
} else if (is_stat_consumption_stage(cpi)) {
#if CONFIG_MISMATCH_DEBUG
mismatch_move_frame_idx_w();
@@ -1479,7 +1520,6 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all);
if (!is_stat_generation_stage(cpi)) {
- const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME];
const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
@@ -1498,16 +1538,24 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
if (!ext_flags->refresh_frame.update_pending) {
av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp, cpi,
cpi->gf_frame_index, 1, cm->remapped_ref_idx);
- } else if (cpi->svc.set_ref_frame_config) {
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config ||
+ use_rtc_reference_structure_one_layer(cpi)) {
for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
- cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i];
+ cm->remapped_ref_idx[i] = cpi->ppi->rtc_ref.ref_idx[i];
}
}
// Get the reference frames
+ bool has_ref_frames = false;
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- ref_frames[i] = get_ref_frame_buf(cm, ref_frame_priority_order[i]);
- ref_frame_buf[i] = ref_frames[i] != NULL ? &ref_frames[i]->buf : NULL;
+ const RefCntBuffer *ref_frame =
+ get_ref_frame_buf(cm, ref_frame_priority_order[i]);
+ ref_frame_buf[i] = ref_frame != NULL ? &ref_frame->buf : NULL;
+ if (ref_frame != NULL) has_ref_frames = true;
+ }
+ if (!has_ref_frames && (frame_params.frame_type == INTER_FRAME ||
+ frame_params.frame_type == S_FRAME)) {
+ return AOM_CODEC_ERROR;
}
// Work out which reference frame slots may be used.
@@ -1563,7 +1611,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
REF_FRAMES * sizeof(*cm->remapped_ref_idx));
- cpi->td.mb.delta_qindex = 0;
+ cpi->td.mb.rdmult_delta_qindex = cpi->td.mb.delta_qindex = 0;
if (!frame_params.show_existing_frame) {
cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
@@ -1628,7 +1676,24 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
// Leave a signal for a higher level caller about if this frame is droppable
if (*size > 0) {
- cpi->droppable = is_frame_droppable(&cpi->svc, &ext_flags->refresh_frame);
+ cpi->droppable =
+ is_frame_droppable(&cpi->ppi->rtc_ref, &ext_flags->refresh_frame);
+ }
+
+ // For SVC: keep track of the (unscaled) source corresponding to the
+ // refresh of the LAST reference (base temporal layer, TL0). Copy it only
+ // for the top spatial enhancement layer so that all spatial layers of the
+ // next superframe have last_source aligned with the previous TL0 superframe.
+ // Skip the copy when the resolution of the unscaled source (top spatial
+ // layer) has changed.
+ if (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width &&
+ cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) {
+ aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0);
}
return AOM_CODEC_OK;
diff --git a/av1/encoder/encode_strategy.h b/av1/encoder/encode_strategy.h
index 45f774d81..c1d14d134 100644
--- a/av1/encoder/encode_strategy.h
+++ b/av1/encoder/encode_strategy.h
@@ -95,12 +95,12 @@ int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
const COMPRESSOR_STAGE compressor_stage);
static AOM_INLINE int is_frame_droppable(
- const SVC *const svc,
+ const RTC_REF *const rtc_ref,
const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
// Droppable frame is only used by external refresh flags. VoD setting won't
// trigger its use case.
- if (svc->set_ref_frame_config)
- return svc->non_reference_frame;
+ if (rtc_ref->set_ref_frame_config)
+ return rtc_ref->non_reference_frame;
else if (ext_refresh_frame_flags->update_pending)
return !(ext_refresh_frame_flags->alt_ref_frame ||
ext_refresh_frame_flags->alt2_ref_frame ||
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 2a395aa30..6700669a3 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -224,7 +224,7 @@ void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
* \param[in] mi_col Block column (in "MI_SIZE" units) index
* \param[out] num_planes Number of image planes (e.g. Y,U,V)
*
- * \return No return value but updates macroblock and thread data
+ * \remark No return value but updates macroblock and thread data
* related to the q / q delta to be used.
*/
static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
@@ -242,7 +242,16 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
const int delta_q_res = delta_q_info->delta_q_res;
int current_qindex = cm->quant_params.base_qindex;
- if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+ if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.qp_mode ==
+ DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+ const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+ const int sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
+ const int sb_index = sb_row * sb_cols + sb_col;
+ current_qindex =
+ cpi->ducky_encode_info.frame_info.superblock_encode_qindex[sb_index];
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
const int block_wavelet_energy_level =
av1_block_wavelet_energy_level(cpi, x, sb_size);
@@ -268,11 +277,18 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
}
+ x->rdmult_cur_qindex = current_qindex;
MACROBLOCKD *const xd = &x->e_mbd;
- current_qindex = av1_adjust_q_from_delta_q_res(
+ const int adjusted_qindex = av1_adjust_q_from_delta_q_res(
delta_q_res, xd->current_base_qindex, current_qindex);
+ if (cpi->use_ducky_encode) {
+ assert(adjusted_qindex == current_qindex);
+ }
+ current_qindex = adjusted_qindex;
x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+ x->rdmult_delta_qindex = x->delta_qindex;
+
av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
xd->mi[0]->current_qindex = current_qindex;
av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
@@ -524,14 +540,14 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
// Initialize the flag to skip cdef to 1.
if (sf->rt_sf.skip_cdef_sb) {
+ const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
// If 128x128 block is used, we need to set the flag for all 4 64x64 sub
// "blocks".
- const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
for (int r = 0; r < block64_in_sb; ++r) {
for (int c = 0; c < block64_in_sb; ++c) {
const int idx_in_sb =
r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
- if (mi[idx_in_sb]) mi[idx_in_sb]->skip_cdef_curr_sb = 1;
+ if (mi[idx_in_sb]) mi[idx_in_sb]->cdef_strength = 1;
}
}
}
@@ -544,20 +560,6 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, nonrd_use_partition_time);
#endif
-
- if (sf->rt_sf.skip_cdef_sb) {
- // If 128x128 block is used, we need to set the flag for all 4 64x64 sub
- // "blocks".
- const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
- const int skip = mi[0]->skip_cdef_curr_sb;
- for (int r = 0; r < block64_in_sb; ++r) {
- for (int c = 0; c < block64_in_sb; ++c) {
- const int idx_in_sb =
- r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
- if (mi[idx_in_sb]) mi[idx_in_sb]->skip_cdef_curr_sb = skip;
- }
- }
- }
}
// This function initializes the stats for encode_rd_sb.
@@ -609,12 +611,131 @@ static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
(void)gather_tpl_data;
#endif
+ x->reuse_inter_pred = false;
x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
av1_zero(x->picked_ref_frames_mask);
av1_invalid_rd_stats(rd_cost);
}
+#if !CONFIG_REALTIME_ONLY
+static void sb_qp_sweep_init_quantizers(AV1_COMP *cpi, ThreadData *td,
+ const TileDataEnc *tile_data,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ RD_STATS *rd_cost, int mi_row,
+ int mi_col, int delta_qp_ofs) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const TileInfo *tile_info = &tile_data->tile_info;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ assert(delta_q_info->delta_q_present_flag);
+ const int delta_q_res = delta_q_info->delta_q_res;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const int use_simple_motion_search =
+ (sf->part_sf.simple_motion_search_split ||
+ sf->part_sf.simple_motion_search_prune_rect ||
+ sf->part_sf.simple_motion_search_early_term_none ||
+ sf->part_sf.ml_early_term_after_part_split_level) &&
+ !frame_is_intra_only(cm);
+ if (use_simple_motion_search) {
+ av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_tree,
+ mi_row, mi_col);
+ }
+
+ int current_qindex = x->rdmult_cur_qindex + delta_qp_ofs;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ current_qindex = av1_adjust_q_from_delta_q_res(
+ delta_q_res, xd->current_base_qindex, current_qindex);
+
+ x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+ // keep track of any non-zero delta-q used
+ td->deltaq_used |= (x->delta_qindex != 0);
+
+ if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+ const int delta_lf_res = delta_q_info->delta_lf_res;
+ const int lfmask = ~(delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
+ const int8_t delta_lf =
+ (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ const int mib_size = cm->seq_params->mib_size;
+
+ // Pre-set the delta lf for the loop filter. Note that this value is set
+ // before mi is assigned for each block in the current superblock.
+ for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+ for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+ mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
+ }
+ }
+ }
+ }
+
+ x->reuse_inter_pred = false;
+ x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+ av1_zero(x->picked_ref_frames_mask);
+ av1_invalid_rd_stats(rd_cost);
+}
+
+static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ SB_FIRST_PASS_STATS *sb_org_stats) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ RD_STATS rdc_winner, cur_rdc;
+ av1_invalid_rd_stats(&rdc_winner);
+
+ int best_qindex = td->mb.rdmult_delta_qindex;
+ const int start = cm->current_frame.frame_type == KEY_FRAME ? -20 : -12;
+ const int end = cm->current_frame.frame_type == KEY_FRAME ? 20 : 12;
+ const int step = cm->delta_q_info.delta_q_res;
+
+ for (int sweep_qp_delta = start; sweep_qp_delta <= end;
+ sweep_qp_delta += step) {
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_tree, &cur_rdc, mi_row,
+ mi_col, sweep_qp_delta);
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, bsize, mi_row, mi_col);
+ av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex;
+
+ PC_TREE *const pc_root = av1_alloc_pc_tree_node(bsize);
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ &cur_rdc, cur_rdc, pc_root, sms_tree, NULL,
+ SB_DRY_PASS, NULL);
+
+ if ((rdc_winner.rdcost > cur_rdc.rdcost) ||
+ (abs(sweep_qp_delta) < abs(best_qindex - x->rdmult_delta_qindex) &&
+ rdc_winner.rdcost == cur_rdc.rdcost)) {
+ rdc_winner = cur_rdc;
+ best_qindex = x->rdmult_delta_qindex + sweep_qp_delta;
+ }
+ }
+
+ return best_qindex;
+}
+#endif //! CONFIG_REALTIME_ONLY
+
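
A minimal sketch of the qindex sweep range used by sb_qp_sweep() above (illustrative only; delta_q_res = 4 is an assumed example value). Key frames sweep offsets of -20..20 and inter frames -12..12 around rdmult_cur_qindex, in steps of delta_q_res; the offset with the lowest RD cost wins, with ties broken toward the smaller absolute offset:

#include <stdio.h>

int main(void) {
  const int delta_q_res = 4;  /* assumed example for cm->delta_q_info.delta_q_res */
  const int is_key_frame = 1;
  const int start = is_key_frame ? -20 : -12;
  const int end = is_key_frame ? 20 : 12;
  int num_candidates = 0;
  for (int offset = start; offset <= end; offset += delta_q_res) {
    printf("candidate qindex offset: %d\n", offset);
    ++num_candidates;
  }
  /* 11 candidates for key frames, 7 for inter frames (with delta_q_res = 4). */
  printf("candidates per superblock: %d\n", num_candidates);
  return 0;
}
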
/*!\brief Encode a superblock (RD-search-based)
*
* \ingroup partition_search
@@ -674,6 +795,14 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
&dummy_rate, &dummy_dist, 1, pc_root);
av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
} else {
+ SB_FIRST_PASS_STATS *sb_org_stats = NULL;
+
+ if (cpi->oxcf.sb_qp_sweep) {
+ CHECK_MEM_ERROR(
+ cm, sb_org_stats,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(SB_FIRST_PASS_STATS)));
+ av1_backup_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+ }
// The most exhaustive recursive partition search
SuperBlockEnc *sb_enc = &x->sb_enc;
// No stats for overlay frames. Exclude key frame.
@@ -696,6 +825,31 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
const int num_passes =
cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 2 : 1;
+ if (cpi->oxcf.sb_qp_sweep &&
+ !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0) &&
+ cm->delta_q_info.delta_q_present_flag) {
+ assert(x->rdmult_delta_qindex == x->delta_qindex);
+ assert(sb_org_stats);
+
+ const int best_qp_diff =
+ sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root,
+ sb_org_stats) -
+ x->rdmult_delta_qindex;
+
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc,
+ mi_row, mi_col, best_qp_diff);
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+ av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+ backup_current_qindex;
+ }
if (num_passes == 1) {
#if CONFIG_PARTITION_SEARCH_ORDER
if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) {
@@ -738,6 +892,8 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
&dummy_rdc, dummy_rdc, pc_root_p1, sms_root, NULL,
SB_WET_PASS, NULL);
}
+ aom_free(sb_org_stats);
+
// Reset to 0 so that it wouldn't be used elsewhere mistakenly.
sb_enc->tpl_data_count = 0;
#if CONFIG_COLLECT_COMPONENT_TIMING
@@ -781,6 +937,89 @@ static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
return 0;
}
+/*!\brief Calculate source SAD at superblock level using 64x64 block source SAD
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int blk_64x64_in_mis = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int num_blk_64x64_cols =
+ (cm->mi_params.mi_cols + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int num_blk_64x64_rows =
+ (cm->mi_params.mi_rows + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int blk_64x64_col_index = mi_col / blk_64x64_in_mis;
+ const int blk_64x64_row_index = mi_row / blk_64x64_in_mis;
+ uint64_t curr_sb_sad = UINT64_MAX;
+ const uint64_t *const src_sad_blk_64x64_data =
+ &cpi->src_sad_blk_64x64[blk_64x64_col_index +
+ blk_64x64_row_index * num_blk_64x64_cols];
+ if (cm->seq_params->sb_size == BLOCK_128X128 &&
+ blk_64x64_col_index + 1 < num_blk_64x64_cols &&
+ blk_64x64_row_index + 1 < num_blk_64x64_rows) {
+ // Calculate SB source SAD by accumulating source SAD of 64x64 blocks in the
+ // superblock
+ curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols + 1];
+ } else if (cm->seq_params->sb_size == BLOCK_64X64) {
+ curr_sb_sad = src_sad_blk_64x64_data[0];
+ }
+ return curr_sb_sad;
+}
+
+/*!\brief Determine whether grading content can be skipped based on SAD stats
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ int mi_row, int mi_col) {
+ if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ return true;
+ const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col);
+ if (curr_sb_sad == UINT64_MAX) return true;
+ if (curr_sb_sad == 0) {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ return false;
+ }
+ AV1_COMMON *const cm = &cpi->common;
+ bool do_calc_src_content = true;
+
+ if (cpi->oxcf.speed < 9) return do_calc_src_content;
+
+ // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size.
+ if (AOMMIN(cm->width, cm->height) < 360) {
+ // Derive the average 64x64 block source SAD from the SB source SAD
+ const uint64_t avg_64x64_blk_sad =
+ (cm->seq_params->sb_size == BLOCK_128X128) ? ((curr_sb_sad + 2) >> 2)
+ : curr_sb_sad;
+
+ // The thresholds are determined based on the kLowSad and kHighSad thresholds
+ // and on test results.
+ const uint64_t thresh_low = 15000;
+ const uint64_t thresh_high = 40000;
+
+ if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) {
+ do_calc_src_content = false;
+ // Note: set x->content_state_sb.source_sad_rd as well if this is extended
+ // to RTC rd path.
+ x->content_state_sb.source_sad_nonrd = kMedSad;
+ }
+ }
+
+ return do_calc_src_content;
+}
+
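
A standalone sketch of the speed >= 9, sub-360p shortcut implemented in is_calc_src_content_needed() above (the thresholds are copied from the patch; the SB SAD value is made up). When the average 64x64 source SAD falls strictly between 15000 and 40000, the superblock is graded kMedSad directly and the full av1_source_content_sb() computation is skipped:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Made-up 128x128 superblock source SAD; thresholds copied from the patch. */
  const uint64_t curr_sb_sad = 90000;
  const int sb_is_128x128 = 1;
  const uint64_t avg_64x64_blk_sad =
      sb_is_128x128 ? ((curr_sb_sad + 2) >> 2) : curr_sb_sad;  /* 22500 */
  const uint64_t thresh_low = 15000;
  const uint64_t thresh_high = 40000;
  if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high)
    printf("grade as kMedSad, skip av1_source_content_sb()\n");
  else
    printf("run the full source-content grading\n");
  return 0;
}
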
/*!\brief Determine whether grading content is needed based on sf and frame stat
*
* \ingroup partition_search
@@ -789,18 +1028,25 @@ static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
*/
// TODO(any): consolidate sfs to make interface cleaner
static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
- MACROBLOCK *const x, int mi_row,
- int mi_col) {
+ MACROBLOCK *const x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
AV1_COMMON *const cm = &cpi->common;
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+ assert(x->content_state_sb.source_sad_nonrd == kMedSad);
+ assert(x->content_state_sb.source_sad_rd == kMedSad);
+ return;
+ }
bool calc_src_content = false;
- if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
- cpi->svc.number_spatial_layers <= 1 &&
- cm->current_frame.frame_type != KEY_FRAME) {
- if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0)
- calc_src_content = true;
- else
+ if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+ if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0) {
+ calc_src_content = is_calc_src_content_needed(cpi, x, mi_row, mi_col);
+ } else {
x->content_state_sb.source_sad_nonrd = kZeroSad;
+ }
} else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) &&
(cm->width * cm->height <= 352 * 288)) {
if (cpi->rc.frame_source_sad > 0)
@@ -808,7 +1054,8 @@ static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
else
x->content_state_sb.source_sad_rd = kZeroSad;
}
- if (calc_src_content) av1_source_content_sb(cpi, x, mi_row, mi_col);
+ if (calc_src_content)
+ av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
}
/*!\brief Encode a superblock row by breaking it into superblocks
@@ -895,7 +1142,7 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
x->content_state_sb.source_sad_rd = kMedSad;
x->content_state_sb.lighting_change = 0;
x->content_state_sb.low_sumdiff = 0;
- x->force_zeromv_skip = 0;
+ x->force_zeromv_skip_for_sb = 0;
if (cpi->oxcf.mode == ALLINTRA) {
x->intra_sb_rdmult_modifier = 128;
@@ -911,7 +1158,7 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
if (seg->enabled) {
const uint8_t *const map =
seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
- const int segment_id =
+ const uint8_t segment_id =
map ? get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col)
: 0;
seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
@@ -924,7 +1171,7 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
// Grade the temporal variation of the sb, the grade will be used to decide
// fast mode search strategy for coding blocks
- grade_source_content_sb(cpi, x, mi_row, mi_col);
+ grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
// encode the superblock
if (use_nonrd_mode) {
@@ -969,6 +1216,8 @@ void av1_alloc_tile_data(AV1_COMP *cpi) {
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
+ av1_row_mt_mem_dealloc(cpi);
+
if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
CHECK_MEM_ERROR(
cm, cpi->tile_data,
@@ -1370,6 +1619,43 @@ static int allow_deltaq_mode(AV1_COMP *cpi) {
#endif // !CONFIG_REALTIME_ONLY
}
+#define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000
+#define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4
+
+// Populates block level thresholds for force zeromv-skip decision
+static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) {
+ if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return;
+
+  // The thresholds for forcing the zeromv-skip decision are as below:
+  // For 128x128 blocks, the threshold is 10000 and the per pixel threshold is
+  // 0.6103. For 64x64 blocks, the threshold is 5000 and the per pixel
+  // threshold is 1.221, allowing slightly higher error for smaller blocks.
+  // The block-level threshold scales with the square root of the block area:
+  //   threshold(64x64) / threshold(128x128)
+  //       = sqrt(area(64x64) / area(128x128)) = sqrt(1/4) = 1/2
+  // Thus, the per pixel thresholds for blocks of size 32x32, 16x16, ... can be
+  // chosen as 2.442, 4.884, .... As the per pixel error tends to be higher for
+  // small blocks, the per pixel threshold is clipped to 4.
+ const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF;
+ const int num_128x128_pix =
+ block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128];
+
+ for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+ const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize];
+
+ // Calculate the threshold for zeromv-skip decision based on area of the
+ // partition
+ unsigned int thresh_exit_part_blk =
+ (unsigned int)(thresh_exit_128x128_part *
+ sqrt((double)num_block_pix / num_128x128_pix) +
+ 0.5);
+ thresh_exit_part_blk = AOMMIN(
+ thresh_exit_part_blk,
+ (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix));
+ cpi->zeromv_skip_thresh_exit_part[bsize] = thresh_exit_part_blk;
+ }
+}
+
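
The table filled by populate_thresh_to_force_zeromv_skip() can be re-derived with the following standalone sketch (illustrative only; it enumerates square block sizes rather than BLOCK_SIZES_ALL). It reproduces the per pixel thresholds quoted in the comment: about 0.610 for 128x128, 1.221 for 64x64, 2.44 for 32x32, and the clip to 4 per pixel for 16x16 and smaller blocks:

#include <math.h>
#include <stdio.h>

int main(void) {
  const unsigned int thresh_128x128 = 10000;  /* FORCE_ZMV_SKIP_128X128_BLK_DIFF */
  const unsigned int max_per_pixel = 4;       /* FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF */
  const int num_128x128_pix = 128 * 128;
  const int widths[] = { 128, 64, 32, 16, 8, 4 };  /* square sizes only */
  for (int i = 0; i < 6; ++i) {
    const int num_pix = widths[i] * widths[i];
    unsigned int thresh = (unsigned int)(
        thresh_128x128 * sqrt((double)num_pix / num_128x128_pix) + 0.5);
    const unsigned int cap = max_per_pixel * (unsigned int)num_pix;
    if (thresh > cap) thresh = cap;  /* clip small blocks to 4 per pixel */
    printf("%3dx%-3d thresh %6u per pixel %.3f\n", widths[i], widths[i], thresh,
           (double)thresh / num_pix);
  }
  return 0;
}
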
/*!\brief Encoder setup (only for the current frame), encoding, and reconstruction
* for a single frame
*
@@ -1524,8 +1810,11 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
// Fix delta q resolution for the moment
+
cm->delta_q_info.delta_q_res = 0;
- if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
+ if (cpi->use_ducky_encode) {
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_DUCKY_ENCODE;
+ } else if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
if (deltaq_mode == DELTA_Q_OBJECTIVE)
cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
else if (deltaq_mode == DELTA_Q_PERCEPTUAL)
@@ -1633,6 +1922,7 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
// has to be called after 'skip_mode_flag' is initialized.
av1_initialize_rd_consts(cpi);
av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex);
+ populate_thresh_to_force_zeromv_skip(cpi);
enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
@@ -1927,9 +2217,10 @@ void av1_encode_frame(AV1_COMP *cpi) {
FeatureFlags *const features = &cm->features;
const int num_planes = av1_num_planes(cm);
RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
// Indicates whether or not to use a default reduced set for ext-tx
// rather than the potential full set of 16 transforms
- features->reduced_tx_set_used = cpi->oxcf.txfm_cfg.reduced_tx_type_set;
+ features->reduced_tx_set_used = oxcf->txfm_cfg.reduced_tx_type_set;
// Make sure segment_id is no larger than last_active_segid.
if (cm->seg.enabled && cm->seg.update_map) {
@@ -1971,7 +2262,8 @@ void av1_encode_frame(AV1_COMP *cpi) {
features->interp_filter = SWITCHABLE;
if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR;
- features->switchable_motion_mode = 1;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
rdc->compound_ref_used_flag = 0;
rdc->skip_mode_used_flag = 0;
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index e011b11a1..c478ef6b4 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -40,6 +40,7 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
for (col = mi_col / num_mi_h;
col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
const int index = row * num_cols + col;
+ assert(cpi->ssim_rdmult_scaling_factors[index] != 0.0);
geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]);
num_of_mi += 1.0;
}
@@ -53,14 +54,6 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
// TODO(angiebird): Move these function to tpl_model.c
#if !CONFIG_REALTIME_ONLY
-static AOM_INLINE int set_deltaq_rdmult(const AV1_COMP *const cpi,
- const MACROBLOCK *const x) {
- const AV1_COMMON *const cm = &cpi->common;
- const CommonQuantParams *quant_params = &cm->quant_params;
- return av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
- quant_params->y_dc_delta_q);
-}
-
// Return the end column for the current superblock, in unit of TPL blocks.
static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col,
int num_mi_w) {
@@ -88,7 +81,7 @@ int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
assert(IMPLIES(cpi->ppi->gf_group.size > 0,
cpi->gf_frame_index < cpi->ppi->gf_group.size));
const int tpl_idx = cpi->gf_frame_index;
- int deltaq_rdmult = set_deltaq_rdmult(cpi, x);
+ int deltaq_rdmult = set_rdmult(cpi, x, -1);
if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult;
if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
@@ -142,7 +135,7 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
assert(IMPLIES(cpi->ppi->gf_group.size > 0,
cpi->gf_frame_index < cpi->ppi->gf_group.size));
const int tpl_idx = cpi->gf_frame_index;
- const int deltaq_rdmult = set_deltaq_rdmult(cpi, x);
+ const int deltaq_rdmult = set_rdmult(cpi, x, -1);
if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index))
return deltaq_rdmult;
@@ -184,7 +177,7 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
av1_set_error_per_bit(&x->errorperbit, rdmult);
#if !CONFIG_RD_COMMAND
if (bsize == cm->seq_params->sb_size) {
- const int rdmult_sb = set_deltaq_rdmult(cpi, x);
+ const int rdmult_sb = set_rdmult(cpi, x, -1);
assert(rdmult_sb == rdmult);
(void)rdmult_sb;
}
@@ -204,39 +197,6 @@ static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
}
}
-static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
- const TX_MODE tx_mode) {
- MACROBLOCKD *const xd = &x->e_mbd;
- TxfmSearchInfo *txfm_info = &x->txfm_search_info;
- if (xd->lossless[mbmi->segment_id]) {
- mbmi->tx_size = TX_4X4;
- } else if (tx_mode != TX_MODE_SELECT) {
- mbmi->tx_size = tx_size_from_tx_mode(mbmi->bsize, tx_mode);
- } else {
- const BLOCK_SIZE bsize = mbmi->bsize;
- const TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
- if (tx_size_wide[min_tx_size] > tx_size_wide[mbmi->tx_size] ||
- tx_size_high[min_tx_size] > tx_size_high[mbmi->tx_size])
- mbmi->tx_size = min_tx_size;
-
- const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
- if (tx_size_wide[max_tx_size] < tx_size_wide[mbmi->tx_size] ||
- tx_size_high[max_tx_size] < tx_size_high[mbmi->tx_size])
- mbmi->tx_size = max_tx_size;
- }
- if (is_inter_block(mbmi)) {
- memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- }
- const int stride = xd->tx_type_map_stride;
- const int bw = mi_size_wide[mbmi->bsize];
- for (int row = 0; row < mi_size_high[mbmi->bsize]; ++row) {
- memset(xd->tx_type_map + row * stride, DCT_DCT,
- bw * sizeof(xd->tx_type_map[0]));
- }
- av1_zero(txfm_info->blk_skip);
- txfm_info->skip_txfm = 0;
-}
-
// This function will copy the best reference mode information from
// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
@@ -309,7 +269,6 @@ void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
mi_addr->segment_id =
map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
- reset_tx_size(x, mi_addr, x->txfm_search_params.tx_mode_search_type);
}
// Else for cyclic refresh mode update the segment map, set the segment id
// and then update the quantizer.
@@ -324,10 +283,10 @@ void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
if (!dry_run && !mi_addr->skip_txfm) {
int cdf_num;
- const int spatial_pred = av1_get_spatial_seg_pred(
+ const uint8_t spatial_pred = av1_get_spatial_seg_pred(
cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
- const int coded_id = av1_neg_interleave(mi_addr->segment_id, spatial_pred,
- seg->last_active_segid + 1);
+ const uint8_t coded_id = av1_neg_interleave(
+ mi_addr->segment_id, spatial_pred, seg->last_active_segid + 1);
int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id];
td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost;
@@ -337,7 +296,7 @@ void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
mi_col)
: 0;
const int use_tmp_pred = pred_segment_id == mi_addr->segment_id;
- const int tmp_pred_ctx = av1_get_pred_context_seg_id(xd);
+ const uint8_t tmp_pred_ctx = av1_get_pred_context_seg_id(xd);
td->rd_counts.seg_tmp_pred_cost[1] +=
x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred];
if (!use_tmp_pred) {
@@ -950,8 +909,10 @@ void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
- sb_enc->tpl_inter_cost[count] = this_stats->inter_cost;
- sb_enc->tpl_intra_cost[count] = this_stats->intra_cost;
+ sb_enc->tpl_inter_cost[count] = this_stats->inter_cost
+ << TPL_DEP_COST_SCALE_LOG2;
+ sb_enc->tpl_intra_cost[count] = this_stats->intra_cost
+ << TPL_DEP_COST_SCALE_LOG2;
memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv));
mi_count++;
count++;
@@ -1020,7 +981,7 @@ int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS);
srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS);
- srcrf_rate += (double)this_stats->srcrf_rate;
+ srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
#ifndef NDEBUG
mi_count++;
#endif
@@ -1309,56 +1270,130 @@ void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
CFL_ALPHABET_SIZE);
}
+// Check neighbor blocks' motion information.
+static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride,
+ const TileInfo *const tile_info, int mi_row,
+ int mi_col) {
+ int is_above_low_motion = 1;
+ int is_left_low_motion = 1;
+ const int thr = 24;
+
+ // Check above block.
+ if (mi_row > tile_info->mi_row_start) {
+ const MB_MODE_INFO *above_mbmi = mi[-mi_stride];
+ const int_mv above_mv = above_mbmi->mv[0];
+ if (above_mbmi->mode >= INTRA_MODE_END &&
+ (abs(above_mv.as_mv.row) > thr || abs(above_mv.as_mv.col) > thr))
+ is_above_low_motion = 0;
+ }
+
+ // Check left block.
+ if (mi_col > tile_info->mi_col_start) {
+ const MB_MODE_INFO *left_mbmi = mi[-1];
+ const int_mv left_mv = left_mbmi->mv[0];
+ if (left_mbmi->mode >= INTRA_MODE_END &&
+ (abs(left_mv.as_mv.row) > thr || abs(left_mv.as_mv.col) > thr))
+ is_left_low_motion = 0;
+ }
+
+ return (is_above_low_motion && is_left_low_motion);
+}
+
+// Check this block's motion in a fast way.
+static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y,
+ int src_ystride,
+ const uint8_t *last_src_y,
+ int last_src_ystride, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+ unsigned int blk_sad = INT_MAX;
+ if (cpi->src_sad_blk_64x64 != NULL) {
+ const int sb_size_by_mb = (bsize == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols =
+ (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sbi_col = mi_col / sb_size_by_mb;
+ const int sbi_row = mi_row / sb_size_by_mb;
+ blk_sad = (unsigned int)cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+ } else {
+ blk_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ }
+
+ // Search 4 1-away points.
+ const uint8_t *const search_pos[4] = {
+ last_src_y - last_src_ystride,
+ last_src_y - 1,
+ last_src_y + 1,
+ last_src_y + last_src_ystride,
+ };
+ unsigned int sad_arr[4];
+ cpi->ppi->fn_ptr[bsize].sdx4df(src_y, src_ystride, search_pos,
+ last_src_ystride, sad_arr);
+
+ blk_sad = (blk_sad * 5) >> 3;
+ return (blk_sad < sad_arr[0] && blk_sad < sad_arr[1] &&
+ blk_sad < sad_arr[2] && blk_sad < sad_arr[3]);
+}
+
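
A numeric illustration of the heuristic in fast_detect_non_zero_motion() above (all SAD values are made up): the zero-MV SAD is scaled down to 5/8 of its value, and the block is treated as low motion only if that scaled SAD is still smaller than the SAD at each of the four one-pixel-away positions:

#include <stdio.h>

int main(void) {
  unsigned int zero_mv_sad = 8000;  /* made-up SAD against the co-located block */
  const unsigned int sad_1away[4] = { 9000, 9500, 8800, 10000 };  /* 4 neighbors */
  zero_mv_sad = (zero_mv_sad * 5) >> 3;  /* scale to 5/8 => 5000 */
  int low_motion = 1;
  for (int i = 0; i < 4; ++i) {
    if (zero_mv_sad >= sad_1away[i]) low_motion = 0;
  }
  printf("treat block as low motion: %d\n", low_motion);  /* 1 */
  return 0;
}
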
// Grade the temporal variation of the source by comparing the current sb and
// its collocated block in the last frame.
-void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
- int mi_col) {
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ if (cpi->last_source->y_width != cpi->source->y_width ||
+ cpi->last_source->y_height != cpi->source->y_height)
+ return;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
+#endif
+
unsigned int tmp_sse;
unsigned int tmp_variance;
const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
uint8_t *src_y = cpi->source->y_buffer;
- int src_ystride = cpi->source->y_stride;
+ const int src_ystride = cpi->source->y_stride;
+ const int src_offset = src_ystride * (mi_row << 2) + (mi_col << 2);
uint8_t *last_src_y = cpi->last_source->y_buffer;
- int last_src_ystride = cpi->last_source->y_stride;
- const int offset = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
- uint64_t avg_source_sse_threshold[2] = { 100000, // ~5*5*(64*64)
- 36000 }; // ~3*3*(64*64)
+ const int last_src_ystride = cpi->last_source->y_stride;
+ const int last_src_offset = last_src_ystride * (mi_row << 2) + (mi_col << 2);
+ uint64_t avg_source_sse_threshold_verylow = 10000; // ~1.5*1.5*(64*64)
+ uint64_t avg_source_sse_threshold_low[2] = { 100000, // ~5*5*(64*64)
+ 36000 }; // ~3*3*(64*64)
+
uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64)
uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5
-#if CONFIG_AV1_HIGHBITDEPTH
- MACROBLOCKD *xd = &x->e_mbd;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
-#endif
- src_y += offset;
- last_src_y += offset;
+ src_y += src_offset;
+ last_src_y += last_src_offset;
tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
last_src_ystride, &tmp_sse);
// rd thresholds
- if (tmp_sse < avg_source_sse_threshold[1])
+ if (tmp_sse < avg_source_sse_threshold_low[1])
x->content_state_sb.source_sad_rd = kLowSad;
// nonrd thresholds
- if (tmp_sse == 0)
+ if (tmp_sse == 0) {
x->content_state_sb.source_sad_nonrd = kZeroSad;
- else if (tmp_sse < avg_source_sse_threshold[0])
+ return;
+ }
+ if (tmp_sse < avg_source_sse_threshold_verylow)
+ x->content_state_sb.source_sad_nonrd = kVeryLowSad;
+ else if (tmp_sse < avg_source_sse_threshold_low[0])
x->content_state_sb.source_sad_nonrd = kLowSad;
else if (tmp_sse > avg_source_sse_threshold_high)
x->content_state_sb.source_sad_nonrd = kHighSad;
// Detect large lighting change.
// Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
- if (tmp_sse > 0) {
- if (tmp_variance < (tmp_sse >> 1) &&
- (tmp_sse - tmp_variance) > sum_sq_thresh)
- x->content_state_sb.lighting_change = 1;
- if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1))
- x->content_state_sb.low_sumdiff = 1;
- }
+ if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
+ x->content_state_sb.lighting_change = 1;
+ if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1))
+ x->content_state_sb.low_sumdiff = 1;
- if (cpi->last_source->y_width != cpi->source->y_width ||
- cpi->last_source->y_height != cpi->source->y_height)
+ if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad ||
+ cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1)
return;
- if (!cpi->sf.rt_sf.use_rtc_tf) return;
// In-place temporal filter. If psnr calculation is enabled, we store the
// source for that.
@@ -1367,10 +1402,35 @@ void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
const unsigned int nmean2 = tmp_sse - tmp_variance;
const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
cm->seq_params->bit_depth);
- const unsigned int threshold = 3 * ac_q_step * ac_q_step / 2;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const int avg_q_step = av1_ac_quant_QTX(p_rc->avg_frame_qindex[INTER_FRAME],
+ 0, cm->seq_params->bit_depth);
+
+ const unsigned int threshold =
+ (cpi->sf.rt_sf.use_rtc_tf == 1)
+ ? (clamp(avg_q_step, 250, 1000)) * ac_q_step
+ : 250 * ac_q_step;
// TODO(yunqing): use a weighted sum instead of averaging in filtering.
if (tmp_variance <= threshold && nmean2 <= 15) {
+ // Check neighbor blocks. If neighbor blocks aren't low-motion blocks,
+ // skip temporal filtering for this block.
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ const int is_neighbor_blocks_low_motion = check_neighbor_blocks(
+ mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col);
+ if (!is_neighbor_blocks_low_motion) return;
+
+ // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB
+ // size.
+ // Test several nearby points. If non-zero mv exists, don't do temporal
+ // filtering.
+ const int is_this_blk_low_motion = fast_detect_non_zero_motion(
+ cpi, src_y, src_ystride, last_src_y, last_src_ystride, mi_row, mi_col);
+
+ if (!is_this_blk_low_motion) return;
+
const int shift_x[2] = { 0, cpi->source->subsampling_x };
const int shift_y[2] = { 0, cpi->source->subsampling_y };
const uint8_t h = block_size_high[bsize];
@@ -1451,8 +1511,12 @@ void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
sb_fp_stats->fc = *td->counts;
- memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
- sizeof(sb_fp_stats->inter_mode_rd_models));
+ // Don't copy in the row_mt case; otherwise this runs into a data race. There
+ // is no behavior change in the row_mt case.
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+ sizeof(sb_fp_stats->inter_mode_rd_models));
+ }
memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
sizeof(sb_fp_stats->thresh_freq_fact));
@@ -1484,8 +1548,11 @@ void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
*td->counts = sb_fp_stats->fc;
- memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
- sizeof(sb_fp_stats->inter_mode_rd_models));
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+ sizeof(sb_fp_stats->inter_mode_rd_models));
+ }
+
memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
sizeof(sb_fp_stats->thresh_freq_fact));
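
The SSE thresholds used by av1_source_content_sb() correspond roughly to a per pixel error of 1.5, 3, 5 and 15 over a 64x64 luma block, as the inline comments note. A small sketch that reproduces the unrounded values (9216, 36864, 102400 and 921600, rounded in the patch to 10000, 36000, 100000 and 1000000):

#include <stdio.h>

int main(void) {
  const int num_pix = 64 * 64;
  const double per_pixel_err[] = { 1.5, 3.0, 5.0, 15.0 };
  for (int i = 0; i < 4; ++i) {
    printf("per pixel error %.1f -> SSE about %.0f\n", per_pixel_err[i],
           per_pixel_err[i] * per_pixel_err[i] * num_pix);
  }
  return 0;
}
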
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 3a0df601c..29350d70f 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -297,15 +297,29 @@ static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
}
}
-static AOM_INLINE int set_segment_rdmult(const AV1_COMP *const cpi,
- MACROBLOCK *const x,
- int8_t segment_id) {
+static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, int segment_id) {
const AV1_COMMON *const cm = &cpi->common;
- av1_init_plane_quantizers(cpi, x, segment_id, 0);
- const int segment_qindex =
- av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
- return av1_compute_rd_mult(cpi,
- segment_qindex + cm->quant_params.y_dc_delta_q);
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+ const FRAME_UPDATE_TYPE update_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+
+ int qindex;
+ if (segment_id >= 0) {
+ qindex = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+ } else {
+ qindex = quant_params->base_qindex + x->rdmult_delta_qindex +
+ quant_params->y_dc_delta_q;
+ }
+
+ return av1_compute_rd_mult(
+ qindex, bit_depth, update_type, layer_depth, boost_index, frame_type,
+ cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi));
}
static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
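A quick numeric sketch of the two qindex paths taken by set_rdmult() above, with hypothetical values:

    // Hypothetical values, for illustration only:
    //   base_qindex = 100, y_dc_delta_q = 0, rdmult_delta_qindex = -8
    // segment_id >= 0: qindex = av1_get_qindex(&cm->seg, segment_id, 100)
    //                  (the segment's qindex, falling back to the base qindex
    //                   when the segment carries no Q feature)
    // segment_id <  0: qindex = 100 + (-8) + 0 = 92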
@@ -391,8 +405,8 @@ void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
int wt_left, int wt_tr);
-void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
- int mi_col);
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col);
void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
int mi_row, int mi_col);
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index b58d13d5d..b819e8244 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -56,7 +56,7 @@ struct encode_b_args {
const struct AV1_COMP *cpi;
MACROBLOCK *x;
struct optimize_ctx *ctx;
- int8_t *skip;
+ uint8_t *skip;
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
RUN_TYPE dry_run;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 3dfe0cf6c..78eb4117f 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -16,6 +16,7 @@
#include <time.h>
#include <stdlib.h>
+#include "av1/common/scale.h"
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
@@ -94,33 +95,33 @@ FILE *yuv_rec_file;
FILE *yuv_denoised_file = NULL;
#endif
-static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
+static INLINE void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) {
switch (mode) {
- case NORMAL:
+ case AOME_NORMAL:
*hr = 1;
*hs = 1;
break;
- case FOURFIVE:
+ case AOME_FOURFIVE:
*hr = 4;
*hs = 5;
break;
- case THREEFIVE:
+ case AOME_THREEFIVE:
*hr = 3;
*hs = 5;
break;
- case THREEFOUR:
+ case AOME_THREEFOUR:
*hr = 3;
*hs = 4;
break;
- case ONEFOUR:
+ case AOME_ONEFOUR:
*hr = 1;
*hs = 4;
break;
- case ONEEIGHT:
+ case AOME_ONEEIGHT:
*hr = 1;
*hs = 8;
break;
- case ONETWO:
+ case AOME_ONETWO:
*hr = 1;
*hs = 2;
break;
@@ -136,30 +137,32 @@ int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
int cols) {
const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) {
- unsigned char *const active_map_8x8 = cpi->active_map.map;
+ unsigned char *const active_map_4x4 = cpi->active_map.map;
const int mi_rows = mi_params->mi_rows;
const int mi_cols = mi_params->mi_cols;
const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
- cpi->active_map.update = 1;
+ cpi->active_map.update = 0;
+ assert(mi_rows % 2 == 0);
+ assert(mi_cols % 2 == 0);
if (new_map_16x16) {
- int r, c;
- for (r = 0; r < mi_rows; ++r) {
- for (c = 0; c < mi_cols; ++c) {
- active_map_8x8[r * mi_cols + c] =
- new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)]
- ? AM_SEGMENT_ID_ACTIVE
- : AM_SEGMENT_ID_INACTIVE;
+ for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+ for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+ const uint8_t val = new_map_16x16[r * cols + c]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ active_map_4x4[(2 * r + 0) * mi_cols + (c + 0)] = val;
+ active_map_4x4[(2 * r + 0) * mi_cols + (c + 1)] = val;
+ active_map_4x4[(2 * r + 1) * mi_cols + (c + 0)] = val;
+ active_map_4x4[(2 * r + 1) * mi_cols + (c + 1)] = val;
}
}
cpi->active_map.enabled = 1;
- } else {
- cpi->active_map.enabled = 0;
}
return 0;
- } else {
- return -1;
}
+
+ return -1;
}
int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
@@ -172,23 +175,32 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
const int mi_cols = mi_params->mi_cols;
const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+ assert(mi_rows % 2 == 0);
+ assert(mi_cols % 2 == 0);
memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
if (cpi->active_map.enabled) {
- int r, c;
- for (r = 0; r < mi_rows; ++r) {
- for (c = 0; c < mi_cols; ++c) {
+ for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+ for (int c = 0; c < (mi_cols >> col_scale); ++c) {
// Cyclic refresh segments are considered active despite not having
// AM_SEGMENT_ID_ACTIVE
- new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)] |=
- seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+ uint8_t temp = 0;
+ temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 0)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 1)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 0)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 1)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ new_map_16x16[r * cols + c] |= temp;
}
}
}
return 0;
- } else {
- return -1;
}
+
+ return -1;
}
void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) {
@@ -224,6 +236,36 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm,
return uncompressed_frame_size / (double)encoded_frame_size;
}
+static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs,
+ int num_tiles_lg, int tile_col_row) {
+ CommonTileParams *const tiles = &cm->tiles;
+ int i, start_sb;
+ int size_sb = num_sbs >> num_tiles_lg;
+ int res_sbs = num_sbs - (size_sb << num_tiles_lg);
+ int num_tiles = 1 << num_tiles_lg;
+ int inc_index = num_tiles - res_sbs;
+
+ tiles->uniform_spacing = 0;
+
+ for (i = 0, start_sb = 0; start_sb < num_sbs && i < MAX_TILE_COLS; ++i) {
+ if (i == inc_index) ++size_sb;
+ if (tile_col_row)
+ tiles->col_start_sb[i] = start_sb;
+ else
+ tiles->row_start_sb[i] = start_sb;
+
+ start_sb += AOMMIN(size_sb, tiles->max_width_sb);
+ }
+
+ if (tile_col_row) {
+ tiles->cols = i;
+ tiles->col_start_sb[i] = num_sbs;
+ } else {
+ tiles->rows = i;
+ tiles->row_start_sb[i] = num_sbs;
+ }
+}
+
static void set_tile_info(AV1_COMMON *const cm,
const TileConfig *const tile_cfg) {
const CommonModeInfoParams *const mi_params = &cm->mi_params;
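A worked example of auto_tile_size_balancing() above, assuming max_width_sb does not clamp the increments:

    // Hypothetical: num_sbs = 19, num_tiles_lg = 2
    // size_sb = 19 >> 2 = 4, res_sbs = 19 - (4 << 2) = 3, inc_index = 4 - 3 = 1
    // i = 0: start_sb = 0                       -> tile width 4
    // i = 1: size_sb bumps to 5, start_sb = 4   -> tile width 5
    // i = 2: start_sb = 9                       -> tile width 5
    // i = 3: start_sb = 14                      -> tile width 5  (14 + 5 = 19)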
@@ -233,14 +275,23 @@ static void set_tile_info(AV1_COMMON *const cm,
av1_get_tile_limits(cm);
+ int sb_cols =
+ CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
// configure tile columns
if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) {
tiles->uniform_spacing = 1;
tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols);
+ // Add a special case to handle super resolution
+ sb_cols = coded_to_superres_mi(sb_cols, cm->superres_scale_denominator);
+ int min_log2_cols = 0;
+ for (; (tiles->max_width_sb << min_log2_cols) <= sb_cols; ++min_log2_cols) {
+ }
+ tiles->log2_cols = AOMMAX(tiles->log2_cols, min_log2_cols);
+
tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols);
+ } else if (tile_cfg->tile_widths[0] < 0) {
+ auto_tile_size_balancing(cm, sb_cols, tile_cfg->tile_columns, 1);
} else {
- int sb_cols =
- CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
int size_sb, j = 0;
tiles->uniform_spacing = 0;
for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
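The small loop added above for the superres case picks the minimum log2 tile-column count so that every column stays within max_width_sb; a rough trace with hypothetical numbers:

    // Hypothetical: max_width_sb = 32, sb_cols = 100
    // k = 0: 32 << 0 = 32  <= 100 -> continue
    // k = 1: 32 << 1 = 64  <= 100 -> continue
    // k = 2: 32 << 2 = 128 >  100 -> stop, min_log2_cols = 2 (at least 4 columns)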
@@ -256,12 +307,14 @@ static void set_tile_info(AV1_COMMON *const cm,
tiles);
// configure tile rows
+ int sb_rows =
+ CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
if (tiles->uniform_spacing) {
tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows);
tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows);
+ } else if (tile_cfg->tile_heights[0] < 0) {
+ auto_tile_size_balancing(cm, sb_rows, tile_cfg->tile_rows, 0);
} else {
- int sb_rows =
- CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
int size_sb, j = 0;
for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
tiles->row_start_sb[i] = start_sb;
@@ -350,6 +403,24 @@ static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width,
} else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0,
2)) {
level = SEQ_LEVEL_6_2;
+ } else if (does_level_match(width, height, init_framerate, 16384, 8704, 30.0,
+ 2)) {
+ level = SEQ_LEVEL_7_0;
+ } else if (does_level_match(width, height, init_framerate, 16384, 8704, 60.0,
+ 2)) {
+ level = SEQ_LEVEL_7_1;
+ } else if (does_level_match(width, height, init_framerate, 16384, 8704, 120.0,
+ 2)) {
+ level = SEQ_LEVEL_7_2;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408, 30.0,
+ 2)) {
+ level = SEQ_LEVEL_8_0;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408, 60.0,
+ 2)) {
+ level = SEQ_LEVEL_8_1;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408,
+ 120.0, 2)) {
+ level = SEQ_LEVEL_8_2;
}
for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
@@ -370,7 +441,8 @@ static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width,
}
void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
- const AV1EncoderConfig *oxcf, int use_svc) {
+ const AV1EncoderConfig *oxcf,
+ int disable_frame_id_numbers) {
SequenceHeader *const seq = &ppi->seq_params;
const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
const ToolCfg *const tool_cfg = &oxcf->tool_cfg;
@@ -385,7 +457,7 @@ void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
seq->frame_id_numbers_present_flag =
!seq->reduced_still_picture_hdr &&
!oxcf->tile_cfg.enable_large_scale_tile &&
- tool_cfg->error_resilient_mode && !use_svc;
+ tool_cfg->error_resilient_mode && !disable_frame_id_numbers;
if (seq->reduced_still_picture_hdr) {
seq->order_hint_info.enable_order_hint = 0;
seq->force_screen_content_tools = 2;
@@ -543,18 +615,20 @@ static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
alloc_compressor_data(cpi);
- av1_update_film_grain_parameters(cpi, oxcf);
-
// Single thread case: use counts in common.
cpi->td.counts = &cpi->counts;
- // Set init SVC parameters.
- cpi->svc.set_ref_frame_config = 0;
- cpi->svc.non_reference_frame = 0;
+ // Init SVC parameters.
cpi->svc.number_spatial_layers = 1;
cpi->svc.number_temporal_layers = 1;
cm->spatial_layer_id = 0;
cm->temporal_layer_id = 0;
+ // Init rtc_ref parameters.
+ cpi->ppi->rtc_ref.set_ref_frame_config = 0;
+ cpi->ppi->rtc_ref.non_reference_frame = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[0] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[1] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[2] = 0;
// change includes all joint functionality
av1_change_config(cpi, oxcf, false);
@@ -641,7 +715,8 @@ void av1_change_config_seq(struct AV1_PRIMARY *ppi,
(ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1)
? ppi->number_spatial_layers * ppi->number_temporal_layers - 1
: 0;
- av1_init_seq_coding_tools(ppi, oxcf, ppi->use_svc);
+ av1_init_seq_coding_tools(
+ ppi, oxcf, ppi->use_svc || ppi->rtc_ref.set_ref_frame_config);
}
seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
@@ -664,6 +739,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ FeatureFlags *const features = &cm->features;
// in case of LAP, lag in frames is set according to number of lap buffers
// calculated at init time. This stores and restores LAP's lag in frames to
@@ -673,9 +749,10 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
}
+ cpi->oxcf = *oxcf;
+
av1_update_film_grain_parameters(cpi, oxcf);
- cpi->oxcf = *oxcf;
// When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize
// superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure
// that any analysis (e.g. TPL) happening outside the main encoding loop still
@@ -691,7 +768,8 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
sizeof(level_params->target_seq_level_idx));
level_params->keep_level_stats = 0;
for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
- if (level_params->target_seq_level_idx[i] <= SEQ_LEVELS) {
+ if (level_params->target_seq_level_idx[i] < SEQ_LEVELS ||
+ level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS) {
level_params->keep_level_stats |= 1u << i;
if (!level_params->level_info[i]) {
CHECK_MEM_ERROR(cm, level_params->level_info[i],
@@ -717,12 +795,12 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
refresh_frame->golden_frame = false;
refresh_frame->bwd_ref_frame = false;
- cm->features.refresh_frame_context =
+ features->refresh_frame_context =
(oxcf->tool_cfg.frame_parallel_decoding_mode)
? REFRESH_FRAME_CONTEXT_DISABLED
: REFRESH_FRAME_CONTEXT_BACKWARD;
if (oxcf->tile_cfg.enable_large_scale_tile)
- cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+ features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
if (x->palette_buffer == NULL) {
CHECK_MEM_ERROR(cm, x->palette_buffer,
@@ -770,9 +848,10 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
rc->worst_quality = rc_cfg->worst_allowed_q;
rc->best_quality = rc_cfg->best_allowed_q;
- cm->features.interp_filter =
+ features->interp_filter =
oxcf->tile_cfg.enable_large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
- cm->features.switchable_motion_mode = 1;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) {
cm->render_width = frm_dim_cfg->render_width;
@@ -803,7 +882,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
set_tile_info(cm, &cpi->oxcf.tile_cfg);
- if (!cpi->svc.set_ref_frame_config)
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config)
cpi->ext_flags.refresh_frame.update_pending = 0;
cpi->ext_flags.refresh_frame_context_pending = 0;
@@ -894,124 +973,129 @@ AV1_PRIMARY *av1_create_primary_compressor(
}
}
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
- ppi->fn_ptr[BT].sdf = SDF; \
- ppi->fn_ptr[BT].sdaf = SDAF; \
- ppi->fn_ptr[BT].vf = VF; \
- ppi->fn_ptr[BT].svf = SVF; \
- ppi->fn_ptr[BT].svaf = SVAF; \
- ppi->fn_ptr[BT].sdx4df = SDX4DF; \
- ppi->fn_ptr[BT].jsdaf = JSDAF; \
- ppi->fn_ptr[BT].jsvaf = JSVAF;
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF;
// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
- aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
+ aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg,
aom_dist_wtd_sub_pixel_avg_variance4x16)
BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
- aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
+ aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg,
aom_dist_wtd_sub_pixel_avg_variance16x4)
BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
- aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
+ aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg,
aom_dist_wtd_sub_pixel_avg_variance8x32)
BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
- aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
+ aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg,
aom_dist_wtd_sub_pixel_avg_variance32x8)
BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
- aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
+ aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg,
aom_dist_wtd_sub_pixel_avg_variance16x64)
BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
- aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
+ aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg,
aom_dist_wtd_sub_pixel_avg_variance64x16)
#endif // !CONFIG_REALTIME_ONLY
BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
- aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
+ aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg,
aom_dist_wtd_sub_pixel_avg_variance128x128)
BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
- aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
+ aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg,
aom_dist_wtd_sub_pixel_avg_variance128x64)
BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
- aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
+ aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg,
aom_dist_wtd_sub_pixel_avg_variance64x128)
BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
- aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
+ aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg,
aom_dist_wtd_sub_pixel_avg_variance32x16)
BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
- aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
+ aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg,
aom_dist_wtd_sub_pixel_avg_variance16x32)
BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
- aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
+ aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg,
aom_dist_wtd_sub_pixel_avg_variance64x32)
BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
- aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
+ aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg,
aom_dist_wtd_sub_pixel_avg_variance32x64)
BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
- aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
+ aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg,
aom_dist_wtd_sub_pixel_avg_variance32x32)
BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
- aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
+ aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg,
aom_dist_wtd_sub_pixel_avg_variance64x64)
BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
- aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
+ aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg,
aom_dist_wtd_sub_pixel_avg_variance16x16)
BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
- aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
+ aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg,
aom_dist_wtd_sub_pixel_avg_variance16x8)
BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
- aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
+ aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg,
aom_dist_wtd_sub_pixel_avg_variance8x16)
BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
- aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
+ aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x8)
BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
- aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
+ aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x4)
BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
- aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
+ aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x8)
BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
- aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
+ aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x4)
#if !CONFIG_REALTIME_ONLY
#define OBFP(BT, OSDF, OVF, OSVF) \
@@ -1210,13 +1294,14 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
BufferPool *const pool, COMPRESSOR_STAGE stage,
int lap_lag_in_frames) {
AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
- AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
- if (!cm) return NULL;
+ if (!cpi) return NULL;
av1_zero(*cpi);
cpi->ppi = ppi;
+
+ AV1_COMMON *volatile const cm = &cpi->common;
cm->seq_params = &ppi->seq_params;
cm->error =
(struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error));
@@ -1335,8 +1420,17 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+ int max_mi_cols = mi_params->mi_cols;
+ int max_mi_rows = mi_params->mi_rows;
+ if (oxcf->frm_dim_cfg.forced_max_frame_width) {
+ max_mi_cols = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_width);
+ }
+ if (oxcf->frm_dim_cfg.forced_max_frame_height) {
+ max_mi_rows = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_height);
+ }
+
CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
- aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2,
+ aom_calloc((max_mi_rows * max_mi_cols) >> 2,
sizeof(*cpi->consec_zero_mv)));
cpi->mb_weber_stats = NULL;
@@ -1346,8 +1440,8 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
const int bsize = BLOCK_16X16;
const int w = mi_size_wide[bsize];
const int h = mi_size_high[bsize];
- const int num_cols = (mi_params->mi_cols + w - 1) / w;
- const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ const int num_cols = (max_mi_cols + w - 1) / w;
+ const int num_rows = (max_mi_rows + h - 1) / h;
CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
aom_calloc(num_rows * num_cols,
sizeof(*cpi->ssim_rdmult_scaling_factors)));
@@ -1548,12 +1642,17 @@ void av1_remove_compressor(AV1_COMP *cpi) {
MultiThreadInfo *const mt_info = &cpi->mt_info;
#if CONFIG_MULTITHREAD
pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
+ pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_;
pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
if (enc_row_mt_mutex_ != NULL) {
pthread_mutex_destroy(enc_row_mt_mutex_);
aom_free(enc_row_mt_mutex_);
}
+ if (enc_row_mt_cond_ != NULL) {
+ pthread_cond_destroy(enc_row_mt_cond_);
+ aom_free(enc_row_mt_cond_);
+ }
if (gm_mt_mutex_ != NULL) {
pthread_mutex_destroy(gm_mt_mutex_);
aom_free(gm_mt_mutex_);
@@ -1918,14 +2017,22 @@ static void set_restoration_unit_size(int width, int height, int sx, int sy,
static void init_ref_frame_bufs(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
int i;
- BufferPool *const pool = cm->buffer_pool;
- cm->cur_frame = NULL;
+ if (cm->cur_frame) {
+ cm->cur_frame->ref_count--;
+ cm->cur_frame = NULL;
+ }
for (i = 0; i < REF_FRAMES; ++i) {
- cm->ref_frame_map[i] = NULL;
+ if (cm->ref_frame_map[i]) {
+ cm->ref_frame_map[i]->ref_count--;
+ cm->ref_frame_map[i] = NULL;
+ }
}
+#ifndef NDEBUG
+ BufferPool *const pool = cm->buffer_pool;
for (i = 0; i < FRAME_BUFFERS; ++i) {
- pool->frame_bufs[i].ref_count = 0;
+ assert(pool->frame_bufs[i].ref_count == 0);
}
+#endif
}
void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
@@ -2108,13 +2215,36 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
}
+static INLINE int extend_borders_mt(const AV1_COMP *cpi,
+ MULTI_THREADED_MODULES stage, int plane) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cpi->mt_info.num_mod_workers[stage] < 2) return 0;
+ switch (stage) {
+ // TODO(deepa.kg@ittiam.com): When cdef and loop-restoration are disabled,
+ // multi-thread frame border extension along with loop filter frame.
+ // As loop-filtering of a superblock row modifies the pixels of the
+ // above superblock row, border extension requires that loop filtering
+ // of the current and above superblock row is complete.
+ case MOD_LPF: return 0;
+ case MOD_CDEF:
+ return is_cdef_used(cm) && !cpi->ppi->rtc_ref.non_reference_frame &&
+ !is_restoration_used(cm) && !av1_superres_scaled(cm);
+ case MOD_LR:
+ return is_restoration_used(cm) &&
+ (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE);
+ default: assert(0);
+ }
+ return 0;
+}
+
/*!\brief Select and apply cdef filters and switchable restoration filters
*
* \ingroup high_level_algo
*/
static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
MACROBLOCKD *xd, int use_restoration,
- int use_cdef) {
+ int use_cdef,
+ unsigned int skip_apply_postproc_filters) {
#if !CONFIG_REALTIME_ONLY
if (use_restoration)
av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
@@ -2135,16 +2265,21 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
// Find CDEF parameters
av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd,
cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult,
- cpi->sf.rt_sf.skip_cdef_sb, cpi->rc.frames_since_key,
- cpi->oxcf.tool_cfg.cdef_control, use_screen_content_model,
- cpi->svc.non_reference_frame);
+ cpi->sf.rt_sf.skip_cdef_sb, cpi->oxcf.tool_cfg.cdef_control,
+ use_screen_content_model,
+ cpi->ppi->rtc_ref.non_reference_frame);
// Apply the filter
- if (!cpi->svc.non_reference_frame) {
+ if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) {
+ assert(!cpi->ppi->rtc_ref.non_reference_frame);
if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with cdef.
+ const int do_extend_border =
+ extend_borders_mt(cpi, MOD_CDEF, /* plane */ 0);
av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker,
cpi->mt_info.workers, &cpi->mt_info.cdef_sync,
- num_workers, av1_cdef_init_fb_row_mt);
+ num_workers, av1_cdef_init_fb_row_mt,
+ do_extend_border);
} else {
av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row);
}
@@ -2152,14 +2287,14 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, cdef_time);
#endif
- } else {
- cm->cdef_info.cdef_bits = 0;
- cm->cdef_info.cdef_strengths[0] = 0;
- cm->cdef_info.nb_cdef_strengths = 1;
- cm->cdef_info.cdef_uv_strengths[0] = 0;
}
- av1_superres_post_encode(cpi);
+ const int use_superres = av1_superres_scaled(cm);
+ if (use_superres) {
+ if ((skip_apply_postproc_filters & SKIP_APPLY_SUPERRES) == 0) {
+ av1_superres_post_encode(cpi);
+ }
+ }
#if !CONFIG_REALTIME_ONLY
#if CONFIG_COLLECT_COMPONENT_TIMING
@@ -2170,21 +2305,22 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
const int num_workers = mt_info->num_mod_workers[MOD_LR];
av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
av1_pick_filter_restoration(cpi->source, cpi);
- if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
- cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
- cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
- if (num_workers > 1)
+ if ((skip_apply_postproc_filters & SKIP_APPLY_RESTORATION) == 0 &&
+ (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) {
+ if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with loop
+ // restoration filter.
+ const int do_extend_border = 1;
av1_loop_restoration_filter_frame_mt(
&cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers,
- &mt_info->lr_row_sync, &cpi->lr_ctxt);
- else
+ &mt_info->lr_row_sync, &cpi->lr_ctxt, do_extend_border);
+ } else {
av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
&cpi->lr_ctxt);
+ }
}
- } else {
- cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
- cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
- cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
}
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, loop_restoration_time);
@@ -2192,8 +2328,22 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
#endif // !CONFIG_REALTIME_ONLY
}
-/*!\brief Select and apply in-loop deblocking filters, cdef filters, and
- * restoration filters
+static void extend_frame_borders(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // TODO(debargha): Fix mv search range on encoder side
+ for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+ const bool extend_border_done = extend_borders_mt(cpi, MOD_CDEF, plane) ||
+ extend_borders_mt(cpi, MOD_LR, plane);
+ if (!extend_border_done) {
+ const YV12_BUFFER_CONFIG *const ybf = &cm->cur_frame->buf;
+ aom_extend_frame_borders_plane_row(ybf, plane, 0,
+ ybf->crop_heights[plane > 0]);
+ }
+ }
+}
+
+/*!\brief Select and apply deblocking filters, cdef filters, and restoration
+ * filters.
*
* \ingroup high_level_algo
*/
@@ -2202,50 +2352,51 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
const int num_workers = mt_info->num_mod_workers[MOD_LPF];
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg),
cm->features.coded_lossless && cm->features.all_lossless));
const int use_loopfilter =
- !cm->features.coded_lossless && !cm->tiles.large_scale;
- const int use_cdef = cm->seq_params->enable_cdef &&
- !cm->features.coded_lossless && !cm->tiles.large_scale;
+ is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc;
+ const int use_cdef = is_cdef_used(cm);
+ const int use_superres = av1_superres_scaled(cm);
const int use_restoration = is_restoration_used(cm);
- // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
- // lpf_opt_level is set to 1 if transform size search depth in inter blocks
- // is limited to one as quad loop filtering assumes that all the transform
- // blocks within a 16x8/8x16/16x16 prediction block are of the same size.
- // lpf_opt_level = 2 : Filters both chroma planes together, in addition to
- // enabling dual/quad loop-filtering. This is enabled when lpf pick method
- // is LPF_PICK_FROM_Q as u and v plane filter levels are equal.
- int lpf_opt_level = 0;
- if (is_inter_tx_size_search_level_one(&cpi->sf.tx_sf)) {
- lpf_opt_level = (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 2 : 1;
- }
- struct loopfilter *lf = &cm->lf;
+ const unsigned int skip_apply_postproc_filters =
+ derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef,
+ use_superres, use_restoration);
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, loop_filter_time);
#endif
if (use_loopfilter) {
av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
- } else {
- lf->filter_level[0] = 0;
- lf->filter_level[1] = 0;
+ struct loopfilter *lf = &cm->lf;
+ if ((lf->filter_level[0] || lf->filter_level[1]) &&
+ (skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0) {
+ assert(!cpi->ppi->rtc_ref.non_reference_frame);
+ // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
+ // lpf_opt_level is set to 1 if transform size search depth in inter
+ // blocks is limited to one as quad loop filtering assumes that all the
+ // transform blocks within a 16x8/8x16/16x16 prediction block are of the
+ // same size. lpf_opt_level = 2 : Filters both chroma planes together, in
+ // addition to enabling dual/quad loop-filtering. This is enabled when lpf
+ // pick method is LPF_PICK_FROM_Q as u and v plane filter levels are
+ // equal.
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+ av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+ mt_info->workers, num_workers,
+ &mt_info->lf_row_sync, lpf_opt_level);
+ }
}
- if ((lf->filter_level[0] || lf->filter_level[1]) &&
- !cpi->svc.non_reference_frame) {
- av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
- mt_info->workers, num_workers,
- &mt_info->lf_row_sync, lpf_opt_level);
- }
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, loop_filter_time);
#endif
- cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef);
+ cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef,
+ skip_apply_postproc_filters);
}
static void update_motion_stat(AV1_COMP *const cpi) {
@@ -2306,9 +2457,23 @@ static int encode_without_recode(AV1_COMP *cpi) {
set_size_independent_vars(cpi);
av1_setup_frame_size(cpi);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
av1_set_mv_search_params(cpi);
+ if (cm->current_frame.frame_number == 0 && cpi->ppi->use_svc &&
+ cpi->svc.temporal_layer_id == 0) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ if (aom_alloc_frame_buffer(
+ &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer for source_last_TL0");
+ }
+ }
+
if (!cpi->ppi->use_svc) {
phase_scaler = 8;
// 2:1 scaling.
@@ -2429,8 +2594,7 @@ static int encode_without_recode(AV1_COMP *cpi) {
// encoded at high/max QP, and if so, set the q and adjust some rate
// control parameters.
if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
- (cpi->rc.high_source_sad ||
- (cpi->ppi->use_svc && cpi->svc.high_source_sad_superframe))) {
+ cpi->rc.high_source_sad) {
if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
@@ -2448,8 +2612,8 @@ static int encode_without_recode(AV1_COMP *cpi) {
if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
suppress_active_map(cpi);
av1_cyclic_refresh_setup(cpi);
- av1_apply_active_map(cpi);
}
+ av1_apply_active_map(cpi);
if (cm->seg.enabled) {
if (!cm->seg.update_data && cm->prev_frame) {
segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
@@ -2586,7 +2750,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
#if !CONFIG_RD_COMMAND
// Determine whether to use screen content tools using two fast encoding.
- if (!cpi->sf.hl_sf.disable_extra_sc_testing)
+ if (!cpi->sf.hl_sf.disable_extra_sc_testing && !cpi->use_ducky_encode)
av1_determine_sc_tools_with_encoding(cpi, q);
#endif // !CONFIG_RD_COMMAND
@@ -2718,12 +2882,8 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
&cpi->ducky_encode_info.frame_info;
if (frame_info->qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) {
q = frame_info->q_index;
-
- // TODO(jingning): Coding block level QP offset is currently disabled
- // in RC lib.
- cm->delta_q_info.delta_q_present_flag = 0;
+ cm->delta_q_info.delta_q_present_flag = frame_info->delta_q_enabled;
}
- // TODO(angiebird): Implement DUCKY_ENCODE_FRAME_MODE_QINDEX_RDMULT mode
}
av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
@@ -3019,24 +3179,16 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
cm->cur_frame->buf.render_width = cm->render_width;
cm->cur_frame->buf.render_height = cm->render_height;
- // Pick the loop filter level for the frame.
+ if (!cpi->mt_info.pipeline_lpf_mt_with_enc)
+ set_postproc_filter_default_params(&cpi->common);
+
if (!cm->features.allow_intrabc) {
loopfilter_frame(cpi, cm);
- } else {
- cm->lf.filter_level[0] = 0;
- cm->lf.filter_level[1] = 0;
- cm->cdef_info.cdef_bits = 0;
- cm->cdef_info.cdef_strengths[0] = 0;
- cm->cdef_info.nb_cdef_strengths = 1;
- cm->cdef_info.cdef_uv_strengths[0] = 0;
- cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
- cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
- cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
}
- // TODO(debargha): Fix mv search range on encoder side
- // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm));
- aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm));
+ if (cpi->oxcf.mode != ALLINTRA && !cpi->ppi->rtc_ref.non_reference_frame) {
+ extend_frame_borders(cpi);
+ }
#ifdef OUTPUT_YUV_REC
aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
@@ -3078,7 +3230,7 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
PSNR_STATS psnr;
aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result;
- frame_result->global_order_idx = cm->cur_frame->order_hint;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
frame_result->q_index = cm->quant_params.base_qindex;
frame_result->rdmult = cpi->rd.RDMULT;
frame_result->rate = (int)(*size) * 8;
@@ -3255,12 +3407,13 @@ static AOM_INLINE int selective_disable_cdf_rtc(const AV1_COMP *cpi) {
if (cpi->svc.number_spatial_layers == 1 &&
cpi->svc.number_temporal_layers == 1) {
// Don't disable on intra_only, scene change (high_source_sad = 1),
- // or resized frame. To avoid quality loss for now, force enable at
- // every 8 frames.
+ // or resized frame. To avoid quality loss, force enable
+ // for ~30 frames after a key frame or scene/slide change, and
+ // after 8 frames since the last update if frame_source_sad > 0.
if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
- rc->high_source_sad || rc->frames_since_key < 10 ||
- cpi->cyclic_refresh->counter_encode_maxq_scene_change < 10 ||
- cm->current_frame.frame_number % 8 == 0)
+ rc->high_source_sad || rc->frames_since_key < 30 ||
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change < 30 ||
+ (cpi->frames_since_last_update > 8 && cpi->rc.frame_source_sad > 0))
return 0;
else
return 1;
@@ -3459,7 +3612,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
DuckyEncodeFrameResult *frame_result =
&cpi->ducky_encode_info.frame_result;
- frame_result->global_order_idx = cm->cur_frame->order_hint;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
frame_result->q_index = cm->quant_params.base_qindex;
frame_result->rdmult = cpi->rd.RDMULT;
frame_result->rate = (int)(*size) * 8;
@@ -3524,7 +3677,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
// Never drop on key frame.
if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR &&
current_frame->frame_type != KEY_FRAME) {
- if (cpi->oxcf.rc_cfg.target_bandwidth == 0 || av1_rc_drop_frame(cpi)) {
+ if (av1_rc_drop_frame(cpi)) {
av1_setup_frame_size(cpi);
av1_set_mv_search_params(cpi);
av1_rc_postencode_update_drop_frame(cpi);
@@ -3597,7 +3750,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
break;
case 1: // Enable CDF update for all frames.
if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame &&
- cpi->svc.non_reference_frame && cpi->rc.frames_since_key > 2)
+ cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2)
features->disable_cdf_update = 1;
else
features->disable_cdf_update = 0;
@@ -3893,8 +4046,11 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
#endif // CONFIG_DENOISE
if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time,
- use_highbitdepth, frame_flags))
+ use_highbitdepth, frame_flags)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "av1_lookahead_push() failed");
res = -1;
+ }
#if CONFIG_INTERNAL_STATS
aom_usec_timer_mark(&timer);
cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer);
@@ -4198,9 +4354,11 @@ static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
av1_firstpass_info_move_cur_index(firstpass_info);
}
#endif
- cpi->rc.frames_since_key++;
- cpi->rc.frames_to_key--;
- cpi->rc.frames_to_fwd_kf--;
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ cpi->rc.frames_to_fwd_kf--;
+ }
}
}
@@ -4210,7 +4368,7 @@ static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
// We should fix the cpi->common.show_frame flag
// instead of checking the other condition to update the counter properly.
if (cpi->common.show_frame ||
- is_frame_droppable(&cpi->svc, &cpi->ext_flags.refresh_frame)) {
+ is_frame_droppable(&cpi->ppi->rtc_ref, &cpi->ext_flags.refresh_frame)) {
// Decrement count down till next gf
if (cpi->rc.frames_till_gf_update_due > 0)
cpi->rc.frames_till_gf_update_due--;
@@ -4859,7 +5017,7 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
return -1;
} else {
int ret;
- if (cm->cur_frame != NULL) {
+ if (cm->cur_frame != NULL && !cpi->oxcf.algo_cfg.skip_postproc_filtering) {
*dest = cm->cur_frame->buf;
dest->y_width = cm->width;
dest->y_height = cm->height;
@@ -4874,7 +5032,9 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
}
int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
- if (cpi->last_show_frame_buf == NULL) return -1;
+ if (cpi->last_show_frame_buf == NULL ||
+ cpi->oxcf.algo_cfg.skip_postproc_filtering)
+ return -1;
*frame = cpi->last_show_frame_buf->buf;
return 0;
@@ -4895,10 +5055,11 @@ aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
int av1_set_internal_size(AV1EncoderConfig *const oxcf,
ResizePendingParams *resize_pending_params,
- AOM_SCALING horiz_mode, AOM_SCALING vert_mode) {
+ AOM_SCALING_MODE horiz_mode,
+ AOM_SCALING_MODE vert_mode) {
int hr = 0, hs = 0, vr = 0, vs = 0;
- if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+ if (horiz_mode > AOME_ONETWO || vert_mode > AOME_ONETWO) return -1;
Scale2Ratio(horiz_mode, &hr, &hs);
Scale2Ratio(vert_mode, &vr, &vs);
@@ -4907,7 +5068,7 @@ int av1_set_internal_size(AV1EncoderConfig *const oxcf,
resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs;
resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs;
- if (horiz_mode != NORMAL || vert_mode != NORMAL) {
+ if (horiz_mode != AOME_NORMAL || vert_mode != AOME_NORMAL) {
oxcf->resize_cfg.resize_mode = RESIZE_FIXED;
oxcf->algo_cfg.enable_tpl_model = 0;
}
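For illustration, the internal-resize arithmetic above with hypothetical dimensions:

    // Hypothetical: frm_dim_cfg = 1280x720,
    //   horiz_mode = AOME_FOURFIVE (hr/hs = 4/5),
    //   vert_mode  = AOME_ONETWO   (vr/vs = 1/2)
    // width  = (5 - 1 + 1280 * 4) / 5 = 5124 / 5 = 1024
    // height = (2 - 1 +  720 * 1) / 2 =  721 / 2 = 360
    // Any mode other than AOME_NORMAL also sets RESIZE_FIXED and disables TPL.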
@@ -4975,29 +5136,33 @@ int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
return AOM_CODEC_OK;
}
-static void svc_set_updates_ref_frame_config(
- ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, SVC *const svc) {
+static void rtc_set_updates_ref_frame_config(
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags,
+ RTC_REF *const rtc_ref) {
ext_refresh_frame_flags->update_pending = 1;
- ext_refresh_frame_flags->last_frame = svc->refresh[svc->ref_idx[0]];
- ext_refresh_frame_flags->golden_frame = svc->refresh[svc->ref_idx[3]];
- ext_refresh_frame_flags->bwd_ref_frame = svc->refresh[svc->ref_idx[4]];
- ext_refresh_frame_flags->alt2_ref_frame = svc->refresh[svc->ref_idx[5]];
- ext_refresh_frame_flags->alt_ref_frame = svc->refresh[svc->ref_idx[6]];
- svc->non_reference_frame = 1;
+ ext_refresh_frame_flags->last_frame = rtc_ref->refresh[rtc_ref->ref_idx[0]];
+ ext_refresh_frame_flags->golden_frame = rtc_ref->refresh[rtc_ref->ref_idx[3]];
+ ext_refresh_frame_flags->bwd_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[4]];
+ ext_refresh_frame_flags->alt2_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[5]];
+ ext_refresh_frame_flags->alt_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[6]];
+ rtc_ref->non_reference_frame = 1;
for (int i = 0; i < REF_FRAMES; i++) {
- if (svc->refresh[i] == 1) {
- svc->non_reference_frame = 0;
+ if (rtc_ref->refresh[i] == 1) {
+ rtc_ref->non_reference_frame = 0;
break;
}
}
}
-static int svc_set_references_external_ref_frame_config(AV1_COMP *cpi) {
+static int rtc_set_references_external_ref_frame_config(AV1_COMP *cpi) {
// LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
// BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
int ref = AOM_REFFRAME_ALL;
for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
- if (!cpi->svc.reference[i]) ref ^= (1 << i);
+ if (!cpi->ppi->rtc_ref.reference[i]) ref ^= (1 << i);
}
return ref;
}
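A small hypothetical RTC_REF configuration to illustrate the mapping above:

    // Hypothetical values, for illustration only:
    //   rtc_ref.ref_idx[0] = 0; rtc_ref.ref_idx[3] = 3;  // LAST, GOLDEN slots
    //   rtc_ref.refresh[0] = 1;                          // refresh LAST's slot only
    // => last_frame flag = 1, golden_frame flag = 0, and non_reference_frame = 0
    //    because at least one slot is refreshed.
    //   Clearing rtc_ref.reference[1] (LAST2) removes bit 1 from the
    //   AOM_REFFRAME_ALL mask returned for av1_use_as_reference().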
@@ -5036,8 +5201,8 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
} else {
- if (cpi->svc.set_ref_frame_config) {
- int ref = svc_set_references_external_ref_frame_config(cpi);
+ if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ int ref = rtc_set_references_external_ref_frame_config(cpi);
av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
}
}
@@ -5064,8 +5229,9 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
ext_refresh_frame_flags->update_pending = 1;
} else {
- if (cpi->svc.set_ref_frame_config)
- svc_set_updates_ref_frame_config(ext_refresh_frame_flags, &cpi->svc);
+ if (cpi->ppi->rtc_ref.set_ref_frame_config)
+ rtc_set_updates_ref_frame_config(ext_refresh_frame_flags,
+ &cpi->ppi->rtc_ref);
else
ext_refresh_frame_flags->update_pending = 0;
}
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index bd6c7a24c..d13f08f83 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -104,16 +104,6 @@ typedef struct aom_rational64 {
} aom_rational64_t; // alias for struct aom_rational
enum {
- NORMAL = 0,
- FOURFIVE = 1,
- THREEFIVE = 2,
- THREEFOUR = 3,
- ONEFOUR = 4,
- ONEEIGHT = 5,
- ONETWO = 6
-} UENUM1BYTE(AOM_SCALING);
-
-enum {
// Good Quality Fast Encoding. The encoder balances quality with the amount of
// time it takes to encode the output. Speed setting controls how fast.
GOOD,
@@ -200,8 +190,6 @@ enum {
#define MAX_VBR_CORPUS_COMPLEXITY 10000
-/*!\cond */
-
typedef enum {
MOD_FP, // First pass
MOD_TF, // Temporal filtering
@@ -214,6 +202,7 @@ typedef enum {
MOD_LR, // Loop restoration filtering
MOD_PACK_BS, // Pack bitstream
MOD_FRAME_ENC, // Frame Parallel encode
+ MOD_AI, // All intra
NUM_MT_MODULES
} MULTI_THREADED_MODULES;
@@ -244,6 +233,17 @@ typedef enum {
3, /*!< Disable loopfilter on frames with low motion. */
} LOOPFILTER_CONTROL;
+/*!\enum SKIP_APPLY_POSTPROC_FILTER
+ * \brief This enum controls the application of post-processing filters on a
+ * reconstructed frame.
+ */
+typedef enum {
+ SKIP_APPLY_RESTORATION = 1 << 0,
+ SKIP_APPLY_SUPERRES = 1 << 1,
+ SKIP_APPLY_CDEF = 1 << 2,
+ SKIP_APPLY_LOOPFILTER = 1 << 3,
+} SKIP_APPLY_POSTPROC_FILTER;
+
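These bits are OR-ed into the mask returned by derive_skip_apply_postproc_filters() (added further below in this header) and tested by the filter stages in encoder.c; a minimal sketch of the consuming pattern:

    const unsigned int skip = derive_skip_apply_postproc_filters(
        cpi, use_loopfilter, use_cdef, use_superres, use_restoration);
    if ((skip & SKIP_APPLY_CDEF) == 0) {
      // apply CDEF to the reconstructed frame
    }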
/*!
* \brief Encoder config related to resize.
*/
@@ -573,13 +573,13 @@ typedef struct {
int drop_frames_water_mark;
/*!
* under_shoot_pct indicates the tolerance of the VBR algorithm to
- * undershoot and is used as a trigger threshold for more agressive
+ * undershoot and is used as a trigger threshold for more aggressive
* adaptation of Q. Its value can range from 0-100.
*/
int under_shoot_pct;
/*!
* over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot
- * and is used as a trigger threshold for more agressive adaptation of Q.
+ * and is used as a trigger threshold for more aggressive adaptation of Q.
* Its value can range from 0-1000.
*/
int over_shoot_pct;
@@ -861,6 +861,12 @@ typedef struct {
* 3: Loop filter is disabled for frames with low motion
*/
LOOPFILTER_CONTROL loopfilter_control;
+
+ /*!
+ * Indicates if the application of post-processing filters should be skipped
+ * on reconstructed frame.
+ */
+ bool skip_postproc_filtering;
} AlgoCfg;
/*!\cond */
@@ -1067,6 +1073,12 @@ typedef struct AV1EncoderConfig {
// Exit the encoder when it fails to encode to a given level.
int strict_level_conformance;
+
+ // Max depth for the GOP after a key frame
+ int kf_max_pyr_height;
+
+ // A flag to control if we enable the superblock qp sweep for a given lambda
+ int sb_qp_sweep;
/*!\endcond */
} AV1EncoderConfig;
@@ -1363,8 +1375,8 @@ typedef struct {
#endif // CONFIG_MULTITHREAD
/*!
* Buffer to store the superblock whose encoding is complete.
- * cur_col[i] stores the number of superblocks which finished encoding in the
- * ith superblock row.
+ * num_finished_cols[i] stores the number of superblocks which finished
+ * encoding in the ith superblock row.
*/
int *num_finished_cols;
/*!
@@ -1493,11 +1505,27 @@ typedef struct {
*/
int thread_id_to_tile_id[MAX_NUM_THREADS];
+ /*!
+ * num_tile_cols_done[i] indicates the number of tile columns whose encoding
+ * is complete in the ith superblock row.
+ */
+ int *num_tile_cols_done;
+
+ /*!
+ * Number of superblock rows in a frame for which 'num_tile_cols_done' is
+ * allocated.
+ */
+ int allocated_sb_rows;
+
#if CONFIG_MULTITHREAD
/*!
* Mutex lock used while dispatching jobs.
*/
pthread_mutex_t *mutex_;
+ /*!
+ * Condition variable used to dispatch loopfilter jobs.
+ */
+ pthread_cond_t *cond_;
#endif
/**
@@ -1678,6 +1706,12 @@ typedef struct MultiThreadInfo {
* Buffers to be stored/restored before/after parallel encode.
*/
RestoreStateBuffers restore_state_buf;
+
+ /*!
+ * In multi-threaded realtime encoding with row-mt enabled, pipeline
+ * loop-filtering after encoding.
+ */
+ int pipeline_lpf_mt_with_enc;
} MultiThreadInfo;
/*!\cond */
@@ -2336,6 +2370,10 @@ typedef struct DuckyEncodeFrameInfo {
DUCKY_ENCODE_GOP_MODE gop_mode;
int q_index;
int rdmult;
+ // These two arrays are equivalent to std::vector<SuperblockEncodeParameters>
+ int *superblock_encode_qindex;
+ int *superblock_encode_rdmult;
+ int delta_q_enabled;
} DuckyEncodeFrameInfo;
typedef struct DuckyEncodeFrameResult {
@@ -2354,6 +2392,22 @@ typedef struct DuckyEncodeInfo {
/*!\endcond */
#endif
+/*!\cond */
+typedef struct RTC_REF {
+ /*!
+ * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ */
+ int reference[INTER_REFS_PER_FRAME];
+ int ref_idx[INTER_REFS_PER_FRAME];
+ int refresh[REF_FRAMES];
+ int set_ref_frame_config;
+ int non_reference_frame;
+ int ref_frame_comp[3];
+ int gld_idx_1layer;
+} RTC_REF;
+/*!\endcond */
+
/*!
* \brief Structure to hold data corresponding to an encoded frame.
*/
@@ -2692,6 +2746,11 @@ typedef struct AV1_PRIMARY {
* found in the frame update type with enum value equal to i
*/
int valid_gm_model_found[FRAME_UPDATE_TYPES];
+
+ /*!
+ * Struct for the reference structure for RTC.
+ */
+ RTC_REF rtc_ref;
} AV1_PRIMARY;
/*!
@@ -3377,6 +3436,11 @@ typedef struct AV1_COMP {
uint64_t *src_sad_blk_64x64;
/*!
+ * SSE between the current frame and the reconstructed last frame
+ */
+ uint64_t rec_sse;
+
+ /*!
* A flag to indicate whether the encoder is controlled by DuckyEncode or not.
* 1:yes 0:no
*/
@@ -3393,6 +3457,11 @@ typedef struct AV1_COMP {
* Frames since last frame with cdf update.
*/
int frames_since_last_update;
+
+ /*!
+ * Block level thresholds to force zeromv-skip at partition level.
+ */
+ unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL];
} AV1_COMP;
/*!
@@ -3604,7 +3673,8 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
int av1_set_internal_size(AV1EncoderConfig *const oxcf,
ResizePendingParams *resize_pending_params,
- AOM_SCALING horiz_mode, AOM_SCALING vert_mode);
+ AOM_SCALING_MODE horiz_mode,
+ AOM_SCALING_MODE vert_mode);
int av1_get_quantizer(struct AV1_COMP *cpi);
@@ -3739,8 +3809,10 @@ static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
// the frame token allocation.
static INLINE unsigned int allocated_tokens(const TileInfo *tile,
int sb_size_log2, int num_planes) {
- int tile_mb_rows = (tile->mi_row_end - tile->mi_row_start + 2) >> 2;
- int tile_mb_cols = (tile->mi_col_end - tile->mi_col_start + 2) >> 2;
+ int tile_mb_rows =
+ ROUND_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, 2);
+ int tile_mb_cols =
+ ROUND_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, 2);
return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
}
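The refactor above is behavior-preserving: ROUND_POWER_OF_TWO(v, 2) expands to (v + 2) >> 2, the same rounding as the old expression; for example, a tile spanning 19 mi units still yields (19 + 2) >> 2 = 5 rows/columns for token allocation.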
@@ -3825,9 +3897,11 @@ static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
cpi->oxcf.gf_cfg.lag_in_frames == 0;
}
-static INLINE int use_one_pass_rt_reference_structure(const AV1_COMP *cpi) {
- return cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 &&
- cpi->ppi->number_temporal_layers == 1;
+// Use default/internal reference structure for single-layer RTC.
+static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) {
+ return is_one_pass_rt_params(cpi) && cpi->ppi->number_spatial_layers == 1 &&
+ cpi->ppi->number_temporal_layers == 1 &&
+ !cpi->ppi->rtc_ref.set_ref_frame_config;
}
// Function return size of frame stats buffer
@@ -3876,12 +3950,12 @@ void av1_setup_frame_size(AV1_COMP *cpi);
// Returns 1 if a frame is scaled and 0 otherwise.
static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
- return !(cm->superres_upscaled_width == cm->render_width &&
- cm->superres_upscaled_height == cm->render_height);
+ return cm->superres_upscaled_width != cm->render_width ||
+ cm->superres_upscaled_height != cm->render_height;
}
static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
- return !av1_superres_scaled(cm) && av1_resize_scaled(cm);
+ return av1_superres_scaled(cm) || av1_resize_scaled(cm);
}
// Don't allow a show_existing_frame to coincide with an error resilient
@@ -4046,18 +4120,101 @@ static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) {
cpi->common.height != resize_pending_params->height));
}
+// Check if loop filter is used.
+static INLINE int is_loopfilter_used(const AV1_COMMON *const cm) {
+ return !cm->features.coded_lossless && !cm->tiles.large_scale;
+}
+
+// Check if CDEF is used.
+static INLINE int is_cdef_used(const AV1_COMMON *const cm) {
+ return cm->seq_params->enable_cdef && !cm->features.coded_lossless &&
+ !cm->tiles.large_scale;
+}
+
// Check if loop restoration filter is used.
static INLINE int is_restoration_used(const AV1_COMMON *const cm) {
return cm->seq_params->enable_restoration && !cm->features.all_lossless &&
!cm->tiles.large_scale;
}
+// Checks if post-processing filters need to be applied.
+// NOTE: This function decides if the application of different post-processing
+// filters on the reconstructed frame can be skipped at the encoder side.
+// However the computation of different filter parameters that are signaled in
+// the bitstream is still required.
+static INLINE unsigned int derive_skip_apply_postproc_filters(
+ const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres,
+ int use_restoration) {
+  // Although CDEF parameter selection should depend on deblocked
+  // (loop-filtered) pixels for cdef_pick_method <= CDEF_FAST_SEARCH_LVL5, in
+  // SVC real-time encoding mode the CDEF strength values are calculated from
+  // pixel values that are not loop-filtered. Hence this case is handled
+  // separately using the condition below.
+ if (cpi->ppi->rtc_ref.non_reference_frame)
+ return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF);
+
+ if (!cpi->oxcf.algo_cfg.skip_postproc_filtering || cpi->ppi->b_calculate_psnr)
+ return 0;
+ assert(cpi->oxcf.mode == ALLINTRA);
+
+ // The post-processing filters are applied one after the other in the
+ // following order: deblocking->cdef->superres->restoration. In case of
+ // ALLINTRA encoding, the reconstructed frame is not used as a reference
+ // frame. Hence, the application of these filters can be skipped when
+ // 1. filter parameters of the subsequent stages are not dependent on the
+ // filtered output of the current stage or
+ // 2. subsequent filtering stages are disabled
+ if (use_restoration) return SKIP_APPLY_RESTORATION;
+ if (use_superres) return SKIP_APPLY_SUPERRES;
+ if (use_cdef) {
+ // CDEF parameter selection is not dependent on the deblocked frame if
+ // cdef_pick_method is CDEF_PICK_FROM_Q. Hence the application of deblocking
+ // filters and cdef filters can be skipped in this case.
+ return (cpi->sf.lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q &&
+ use_loopfilter)
+ ? (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF)
+ : SKIP_APPLY_CDEF;
+ }
+ if (use_loopfilter) return SKIP_APPLY_LOOPFILTER;
+
+ return 0; // All post-processing stages disabled.
+}
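
The returned value is a bitmask of SKIP_APPLY_* flags, so each post-processing stage can be tested independently by the caller. A standalone sketch of consuming such a mask, with placeholder flag values (the real values live in the encoder headers, and the printf branches stand in for the actual filter stages):

    /* Sketch of consuming a SKIP_APPLY_* bitmask; flag values are placeholders. */
    #include <stdio.h>

    #define SKIP_APPLY_LOOPFILTER (1 << 0) /* assumed values, for illustration */
    #define SKIP_APPLY_CDEF (1 << 1)
    #define SKIP_APPLY_SUPERRES (1 << 2)
    #define SKIP_APPLY_RESTORATION (1 << 3)

    int main(void) {
      /* e.g. the fast path where CDEF strength comes from Q: both deblocking
       * and CDEF application are skipped on the encoder side. */
      const unsigned int skip = SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF;
      if (!(skip & SKIP_APPLY_LOOPFILTER)) printf("apply deblocking\n");
      if (!(skip & SKIP_APPLY_CDEF)) printf("apply CDEF\n");
      if (!(skip & SKIP_APPLY_SUPERRES)) printf("apply superres\n");
      if (!(skip & SKIP_APPLY_RESTORATION)) printf("apply loop restoration\n");
      return 0;
    }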
+
+static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) {
+ struct loopfilter *const lf = &cm->lf;
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ RestorationInfo *const rst_info = cm->rst_info;
+
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ cdef_info->cdef_bits = 0;
+ cdef_info->cdef_strengths[0] = 0;
+ cdef_info->nb_cdef_strengths = 1;
+ cdef_info->cdef_uv_strengths[0] = 0;
+ rst_info[0].frame_restoration_type = RESTORE_NONE;
+ rst_info[1].frame_restoration_type = RESTORE_NONE;
+ rst_info[2].frame_restoration_type = RESTORE_NONE;
+}
+
static INLINE int is_inter_tx_size_search_level_one(
const TX_SPEED_FEATURES *tx_sf) {
return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 &&
tx_sf->inter_tx_size_search_init_depth_sqr >= 1);
}
+static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) {
+ int lpf_opt_level = 0;
+ if (is_inter_tx_size_search_level_one(&sf->tx_sf))
+ lpf_opt_level = (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 2 : 1;
+ return lpf_opt_level;
+}
+
+// Enable switchable motion mode only if warp and OBMC tools are allowed
+static INLINE bool is_switchable_motion_mode_allowed(bool allow_warped_motion,
+ bool enable_obmc) {
+ return (allow_warped_motion || enable_obmc);
+}
+
#if CONFIG_AV1_TEMPORAL_DENOISING
static INLINE int denoise_svc(const struct AV1_COMP *const cpi) {
return (!cpi->ppi->use_svc ||
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 72c823eae..f4c345f1e 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -269,6 +269,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free_frame_buffer(&cpi->scaled_source);
aom_free_frame_buffer(&cpi->scaled_last_source);
aom_free_frame_buffer(&cpi->orig_source);
+ aom_free_frame_buffer(&cpi->svc.source_last_TL0);
free_token_info(token_info);
@@ -294,6 +295,8 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
}
if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi);
+ aom_free(cpi->svc.layer_context);
+ cpi->svc.layer_context = NULL;
if (cpi->consec_zero_mv) {
aom_free(cpi->consec_zero_mv);
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index 75dfd9b80..beb8f5486 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -432,9 +432,9 @@ void av1_apply_active_map(AV1_COMP *cpi) {
if (cpi->active_map.update) {
if (cpi->active_map.enabled) {
- for (i = 0;
- i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
- ++i)
+ const int num_mis =
+ cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+ for (i = 0; i < num_mis; ++i)
if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
av1_enable_segmentation(seg);
av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
@@ -636,7 +636,6 @@ void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
const AV1EncoderConfig *oxcf) {
AV1_COMMON *const cm = &cpi->common;
- cpi->oxcf = *oxcf;
const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
if (cpi->film_grain_table) {
@@ -733,9 +732,16 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- const bool has_optimized_scaler = av1_has_optimized_scaler(
- cm->width, cm->height, new_fb->buf.y_crop_width,
+ bool has_optimized_scaler = av1_has_optimized_scaler(
+ ref->y_crop_width, ref->y_crop_height, new_fb->buf.y_crop_width,
new_fb->buf.y_crop_height);
+ if (num_planes > 1) {
+ has_optimized_scaler =
+ has_optimized_scaler &&
+ av1_has_optimized_scaler(
+ ref->uv_crop_width, ref->uv_crop_height,
+ new_fb->buf.uv_crop_width, new_fb->buf.uv_crop_height);
+ }
#if CONFIG_AV1_HIGHBITDEPTH
if (use_optimized_scaler && has_optimized_scaler &&
cm->seq_params->bit_depth == AOM_BITS_8)
@@ -796,7 +802,10 @@ BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
? BLOCK_128X128
: BLOCK_64X64;
} else if (oxcf->mode == REALTIME) {
- return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
+ if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN)
+ return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64;
+ else
+ return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
}
// TODO(any): Possibly could improve this with a heuristic.
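
In REALTIME mode the change above only moves the 720-pixel boundary for screen content: a 1280x720 screen-content stream now selects 128x128 superblocks, while camera content at the same size keeps 64x64. A minimal restatement of the rule, assuming the standard AOMMIN definition (the helper below is illustrative, not an encoder function):

    /* Illustrative restatement of the REALTIME superblock-size selection. */
    #include <stdio.h>

    #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))

    static const char *rt_sb_size(int width, int height, int is_screen) {
      const int min_dim = AOMMIN(width, height);
      if (is_screen) return min_dim >= 720 ? "BLOCK_128X128" : "BLOCK_64X64";
      return min_dim > 720 ? "BLOCK_128X128" : "BLOCK_64X64";
    }

    int main(void) {
      printf("%s\n", rt_sb_size(1280, 720, 1)); /* BLOCK_128X128 */
      printf("%s\n", rt_sb_size(1280, 720, 0)); /* BLOCK_64X64 */
      return 0;
    }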
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 44294dbff..92e69dad1 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -42,29 +42,34 @@ extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) {
unsigned char *const seg_map = cpi->enc_seg.map;
int i;
+ const int num_mis =
+ cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
if (cpi->active_map.enabled || cpi->active_map.update)
- for (i = 0;
- i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i)
+ for (i = 0; i < num_mis; ++i)
if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
seg_map[i] = AM_SEGMENT_ID_ACTIVE;
}
-static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width,
- int height) {
+// Returns 'size' in the number of Mode Info (MI) units. 'size' is either the
+// width or height.
+static AOM_INLINE int size_in_mi(int size) {
// Ensure that the decoded width and height are both multiples of
// 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
// subsampling is used).
// This simplifies the implementation of various experiments,
// eg. cdef, which operates on units of 8x8 luma pixels.
- const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
- const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+ const int aligned_size = ALIGN_POWER_OF_TWO(size, 3);
+ return aligned_size >> MI_SIZE_LOG2;
+}
- mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
- mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
+static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width,
+ int height) {
+ mi_params->mi_cols = size_in_mi(width);
+ mi_params->mi_rows = size_in_mi(height);
mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
- mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
- mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
+ mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
+ mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
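
For a concrete sense of the helpers above: ROUND_POWER_OF_TWO(v, 2) equals the old open-coded (v + 2) >> 2, and for a 1920x1080 frame size_in_mi() yields a 480x270 MI grid, which set_mb_mi() rounds to a 120x68 MB grid. A small worked example, assuming MI_SIZE_LOG2 == 2 and the usual libaom macro definitions:

    /* Worked example of size_in_mi()/set_mb_mi() for a 1920x1080 frame,
     * assuming MI_SIZE_LOG2 == 2 (one MI unit = 4 luma pixels). */
    #include <stdio.h>

    #define ALIGN_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
    #define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))
    #define MI_SIZE_LOG2 2

    static int size_in_mi(int size) {
      return ALIGN_POWER_OF_TWO(size, 3) >> MI_SIZE_LOG2;
    }

    int main(void) {
      const int mi_cols = size_in_mi(1920);               /* 480 */
      const int mi_rows = size_in_mi(1080);               /* 270 */
      const int mb_cols = ROUND_POWER_OF_TWO(mi_cols, 2); /* 120 */
      const int mb_rows = ROUND_POWER_OF_TWO(mi_rows, 2); /* 68  */
      printf("MI %dx%d, MB %dx%d, MBs=%d\n", mi_cols, mi_rows, mb_cols, mb_rows,
             mb_cols * mb_rows);                          /* MBs=8160 */
      return 0;
    }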
@@ -122,14 +127,15 @@ static AOM_INLINE void init_buffer_indices(
force_intpel_info->rate_size = 0;
}
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
- ppi->fn_ptr[BT].sdf = SDF; \
- ppi->fn_ptr[BT].sdaf = SDAF; \
- ppi->fn_ptr[BT].vf = VF; \
- ppi->fn_ptr[BT].svf = SVF; \
- ppi->fn_ptr[BT].svaf = SVAF; \
- ppi->fn_ptr[BT].sdx4df = SDX4DF; \
- ppi->fn_ptr[BT].jsdaf = JSDAF; \
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
ppi->fn_ptr[BT].jsvaf = JSVAF;
#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \
@@ -140,6 +146,7 @@ static AOM_INLINE void init_buffer_indices(
aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \
aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \
aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD, \
aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
@@ -229,71 +236,93 @@ static AOM_INLINE void init_buffer_indices(
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d)
#if !CONFIG_REALTIME_ONLY
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d)
#endif
MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
@@ -970,6 +999,7 @@ static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor,
static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
int i;
AV1_COMMON *const cm = &cpi->common;
+ FeatureFlags *const features = &cm->features;
for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
cm->global_motion[i] = default_warp_params;
}
@@ -977,8 +1007,9 @@ static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
av1_set_speed_features_framesize_independent(cpi, cpi->speed);
av1_set_rd_speed_thresholds(cpi);
- cm->features.interp_filter = SWITCHABLE;
- cm->features.switchable_motion_mode = 1;
+ features->interp_filter = SWITCHABLE;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc);
}
static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
@@ -1000,7 +1031,7 @@ static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) {
static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) {
const SequenceHeader *const seq_params = cpi->common.seq_params;
return is_one_pass_rt_params(cpi) &&
- use_one_pass_rt_reference_structure(cpi) &&
+ use_rtc_reference_structure_one_layer(cpi) &&
(seq_params->order_hint_info.enable_order_hint == 0) &&
cpi->rt_reduce_num_ref_buffers;
}
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 7be776821..c0b0e302d 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -12,6 +12,7 @@
#include "av1/common/warped_motion.h"
#include "av1/common/thread_common.h"
+#include "av1/encoder/allintra_vis.h"
#include "av1/encoder/bitstream.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encoder.h"
@@ -24,6 +25,7 @@
#include "av1/encoder/global_motion.h"
#include "av1/encoder/global_motion_facade.h"
#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/picklpf.h"
#include "av1/encoder/rdopt.h"
#include "aom_dsp/aom_dsp_common.h"
#include "av1/encoder/temporal_filter.h"
@@ -221,6 +223,11 @@ static void row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
}
}
+static AOM_INLINE int get_sb_rows_in_frame(AV1_COMMON *cm) {
+ return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows,
+ cm->seq_params->mib_size_log2);
+}
+
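
get_sb_rows_in_frame() is a ceiling division of the frame's MI rows by the superblock size in MI units. A standalone sketch, assuming CEIL_POWER_OF_TWO has the usual rounding-up definition and using 1080p (270 MI rows) as the input:

    /* Sketch of get_sb_rows_in_frame(): superblock rows needed to cover the
     * frame; the CEIL_POWER_OF_TWO definition below is an assumption. */
    #include <stdio.h>

    #define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))

    int main(void) {
      const int mi_rows = 270; /* 1080 luma rows => 270 MI rows */
      printf("%d\n", CEIL_POWER_OF_TWO(mi_rows, 5)); /* 128x128 SB: 9 rows  */
      printf("%d\n", CEIL_POWER_OF_TWO(mi_rows, 4)); /* 64x64 SB:  17 rows  */
      return 0;
    }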
static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
int alloc_row_ctx) {
struct AV1Common *cm = &cpi->common;
@@ -229,6 +236,8 @@ static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
const int tile_rows = cm->tiles.rows;
int tile_col, tile_row;
+ av1_row_mt_mem_dealloc(cpi);
+
// Allocate memory for row based multi-threading
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
@@ -247,10 +256,16 @@ static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
}
}
}
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ CHECK_MEM_ERROR(
+ cm, enc_row_mt->num_tile_cols_done,
+ aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows));
+
enc_row_mt->allocated_tile_cols = tile_cols;
enc_row_mt->allocated_tile_rows = tile_rows;
enc_row_mt->allocated_rows = max_rows;
enc_row_mt->allocated_cols = max_cols - 1;
+ enc_row_mt->allocated_sb_rows = sb_rows;
}
void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
@@ -270,10 +285,13 @@ void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx);
}
}
+ aom_free(enc_row_mt->num_tile_cols_done);
+ enc_row_mt->num_tile_cols_done = NULL;
enc_row_mt->allocated_rows = 0;
enc_row_mt->allocated_cols = 0;
enc_row_mt->allocated_tile_cols = 0;
enc_row_mt->allocated_tile_rows = 0;
+ enc_row_mt->allocated_sb_rows = 0;
}
static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id,
@@ -433,6 +451,40 @@ static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
}
#endif
+static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
+ AV1EncRowMultiThreadInfo *enc_row_mt,
+ int mib_size_log2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ AV1LfMTInfo *cur_job_info;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+
+ while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+ LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data;
+ const int lpf_opt_level = cur_job_info->lpf_opt_level;
+ (void)sb_rows;
+#if CONFIG_MULTITHREAD
+ const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2;
+ const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
+ // Wait for current and next superblock row to finish encoding.
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ while (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+ enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols) {
+ pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
+ }
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ av1_thread_loop_filter_rows(
+ lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
+ cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
+ lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf,
+ mib_size_log2);
+ }
+}
+
static int enc_row_mt_worker_hook(void *arg1, void *unused) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
AV1_COMP *const cpi = thread_data->cpi;
@@ -440,10 +492,17 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
int thread_id = thread_data->thread_id;
AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
#if CONFIG_MULTITHREAD
pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif
(void)unused;
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ thread_data->td->rt_pc_root =
+ cpi->sf.rt_sf.use_nonrd_pick_mode
+ ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
+ : NULL;
assert(cur_tile_id != -1);
@@ -480,6 +539,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
const int tile_row = tile_info->tile_row;
const int tile_col = tile_info->tile_col;
ThreadData *td = thread_data->td;
+ const int sb_row = current_mi_row >> mib_size_log2;
assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
@@ -510,11 +570,23 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
#endif
this_tile->abs_sum_level += td->abs_sum_level;
row_mt_sync->num_threads_working--;
+ enc_row_mt->num_tile_cols_done[sb_row]++;
#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(enc_row_mt->cond_);
pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
}
-
+ if (cpi->mt_info.pipeline_lpf_mt_with_enc &&
+ (cm->lf.filter_level[PLANE_TYPE_Y] ||
+ cm->lf.filter_level[PLANE_TYPE_UV])) {
+ // Loop-filter a superblock row if encoding of the current and next
+ // superblock row is complete.
+ // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving
+ // encoding and loop filter stage.
+ launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
+ }
+ av1_free_pc_tree_recursive(thread_data->td->rt_pc_root, av1_num_planes(cm), 0,
+ 0);
return 1;
}
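
The encode/loop-filter pipeline above is a counter-plus-condition-variable handshake: each encode worker increments num_tile_cols_done[sb_row] under the shared mutex and broadcasts, while launch_loop_filter_rows() waits until both the current and the next superblock row report all tile columns done before filtering. A stripped-down sketch of that handshake (thread setup, counters and sizes are simplified stand-ins, not the encoder's):

    /* Minimal illustration of the wait/broadcast pattern between encoding and
     * loop filtering; names and sizes are simplified for the example. */
    #include <pthread.h>
    #include <stdio.h>

    enum { kSbRows = 4, kTileCols = 2 };

    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int num_tile_cols_done[kSbRows];

    static void *encode_worker(void *arg) {
      (void)arg;
      for (int sb_row = 0; sb_row < kSbRows; ++sb_row) {
        for (int tile_col = 0; tile_col < kTileCols; ++tile_col) {
          /* ... encode one superblock row of one tile column ... */
          pthread_mutex_lock(&mutex);
          num_tile_cols_done[sb_row]++;
          pthread_cond_broadcast(&cond); /* wake any waiting filter worker */
          pthread_mutex_unlock(&mutex);
        }
      }
      return NULL;
    }

    static void *lf_worker(void *arg) {
      (void)arg;
      for (int sb_row = 0; sb_row < kSbRows; ++sb_row) {
        const int next = (sb_row + 1 < kSbRows) ? sb_row + 1 : sb_row;
        pthread_mutex_lock(&mutex);
        /* Filter sb_row only once it and the next row are fully encoded. */
        while (num_tile_cols_done[sb_row] < kTileCols ||
               num_tile_cols_done[next] < kTileCols) {
          pthread_cond_wait(&cond, &mutex);
        }
        pthread_mutex_unlock(&mutex);
        printf("loop filter superblock row %d\n", sb_row);
      }
      return NULL;
    }

    int main(void) {
      pthread_t enc, lf;
      pthread_create(&lf, NULL, lf_worker, NULL);
      pthread_create(&enc, NULL, encode_worker, NULL);
      pthread_join(enc, NULL);
      pthread_join(lf, NULL);
      return 0;
    }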
@@ -527,6 +599,12 @@ static int enc_worker_hook(void *arg1, void *unused) {
int t;
(void)unused;
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ thread_data->td->rt_pc_root =
+ cpi->sf.rt_sf.use_nonrd_pick_mode
+ ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
+ : NULL;
for (t = thread_data->start; t < tile_rows * tile_cols;
t += cpi->mt_info.num_workers) {
@@ -540,6 +618,9 @@ static int enc_worker_hook(void *arg1, void *unused) {
av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
}
+ av1_free_pc_tree_recursive(thread_data->td->rt_pc_root, av1_num_planes(cm), 0,
+ 0);
+
return 1;
}
@@ -595,6 +676,11 @@ void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
aom_malloc(sizeof(*(enc_row_mt->mutex_))));
if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
}
+ if (enc_row_mt->cond_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->cond_,
+ aom_malloc(sizeof(*(enc_row_mt->cond_))));
+ if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL);
+ }
}
if (!is_first_pass) {
@@ -1221,9 +1307,6 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
thread_data->td->mb.txfm_search_info.tx_search_count;
#endif // CONFIG_SPEED_STATS
}
-
- av1_free_pc_tree_recursive(thread_data->td->rt_pc_root,
- av1_num_planes(&cpi->common), 0, 0);
}
}
@@ -1231,10 +1314,16 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
int num_workers) {
MultiThreadInfo *const mt_info = &cpi->mt_info;
AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &mt_info->workers[i];
EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ // Initialize loopfilter data
+ thread_data->lf_sync = &mt_info->lf_row_sync;
+ thread_data->lf_data = &thread_data->lf_sync->lfdata[i];
+ loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd);
+
worker->hook = hook;
worker->data1 = thread_data;
worker->data2 = NULL;
@@ -1322,13 +1411,6 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
thread_data->td->mb.tmp_pred_bufs[j];
}
}
-
- // Preallocate the pc_tree for realtime coding to reduce the cost of memory
- // allocation.
- thread_data->td->rt_pc_root =
- cpi->sf.rt_sf.use_nonrd_pick_mode
- ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
- : NULL;
}
}
@@ -1452,22 +1534,34 @@ void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
}
-// Computes the maximum number of sb_rows for row multi-threading of encoding
-// stage
-static AOM_INLINE void compute_max_sb_rows_cols(AV1_COMP *cpi, int *max_sb_rows,
- int *max_sb_cols) {
- AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = cm->tiles.cols;
+// Computes the maximum number of sb rows and sb cols across all tiles, which
+// are used to allocate memory for multi-threaded encoding with row-mt=1.
+static AOM_INLINE void compute_max_sb_rows_cols(const AV1_COMMON *cm,
+ int *max_sb_rows_in_tile,
+ int *max_sb_cols_in_tile) {
const int tile_rows = cm->tiles.rows;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
for (int row = 0; row < tile_rows; row++) {
- for (int col = 0; col < tile_cols; col++) {
- const int tile_index = row * cm->tiles.cols + col;
- const TileInfo *const tile_info = &cpi->tile_data[tile_index].tile_info;
- const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, tile_info);
- const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
- *max_sb_rows = AOMMAX(*max_sb_rows, num_sb_rows_in_tile);
- *max_sb_cols = AOMMAX(*max_sb_cols, num_sb_cols_in_tile);
- }
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_sb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, mib_size_log2);
+ *max_sb_rows_in_tile = AOMMAX(*max_sb_rows_in_tile, num_sb_rows_in_tile);
+ }
+
+ const int tile_cols = cm->tiles.cols;
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int *const col_start_sb = cm->tiles.col_start_sb;
+ for (int col = 0; col < tile_cols; col++) {
+ const int mi_col_start = col_start_sb[col] << mib_size_log2;
+ const int mi_col_end =
+ AOMMIN(col_start_sb[col + 1] << mib_size_log2, num_mi_cols);
+ const int num_sb_cols_in_tile =
+ CEIL_POWER_OF_TWO(mi_col_end - mi_col_start, mib_size_log2);
+ *max_sb_cols_in_tile = AOMMAX(*max_sb_cols_in_tile, num_sb_cols_in_tile);
}
}
@@ -1498,57 +1592,123 @@ int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
// Computes the maximum number of mb_rows for row multi-threading of firstpass
// stage
-static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *const cm,
- const TileDataEnc *const tile_data,
- const BLOCK_SIZE fp_block_size) {
- const int tile_cols = cm->tiles.cols;
+static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *cm,
+ BLOCK_SIZE fp_block_size) {
const int tile_rows = cm->tiles.rows;
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
int max_mb_rows = 0;
+
for (int row = 0; row < tile_rows; row++) {
- for (int col = 0; col < tile_cols; col++) {
- const int tile_index = row * cm->tiles.cols + col;
- const TileInfo *const tile_info = &tile_data[tile_index].tile_info;
- const int num_mb_rows_in_tile =
- av1_get_unit_rows_in_tile(tile_info, fp_block_size);
- max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
- }
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_mb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2);
+ max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
}
return max_mb_rows;
}
#endif
+static void lpf_pipeline_mt_init(AV1_COMP *cpi) {
+  // Pipelining of loop-filtering after encoding is enabled when the
+  // loop-filter level is chosen based on the quantizer and frame type. It is
+  // disabled for 'LOOPFILTER_SELECTIVELY' since the stats collected during
+  // the encoding stage decide the filter level. Loop-filtering is also
+  // disabled for non-reference frames and for frames with the intra block
+  // copy tool enabled.
+ AV1_COMMON *cm = &cpi->common;
+ const int use_loopfilter = is_loopfilter_used(cm);
+ const int use_superres = av1_superres_scaled(cm);
+ const int use_cdef = is_cdef_used(cm);
+ const int use_restoration = is_restoration_used(cm);
+
+ const unsigned int skip_apply_postproc_filters =
+ derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef,
+ use_superres, use_restoration);
+ cpi->mt_info.pipeline_lpf_mt_with_enc =
+ (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) &&
+ (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) &&
+ (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) &&
+ !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc &&
+ ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0);
+
+ if (!cpi->mt_info.pipeline_lpf_mt_with_enc) return;
+
+ set_postproc_filter_default_params(cm);
+
+ if (!use_loopfilter) return;
+
+ const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick;
+ assert(method == LPF_PICK_FROM_Q);
+ assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY);
+
+ av1_pick_filter_level(cpi->source, cpi, method);
+
+ struct loopfilter *lf = &cm->lf;
+ const int plane_start = 0;
+ const int plane_end = av1_num_planes(cm);
+ int planes_to_lf[MAX_MB_PLANE];
+ if ((lf->filter_level[PLANE_TYPE_Y] || lf->filter_level[PLANE_TYPE_UV]) &&
+ check_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end)) {
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+ assert(lpf_opt_level == 2);
+
+ const int start_mi_row = 0;
+ const int end_mi_row = start_mi_row + cm->mi_params.mi_rows;
+
+ av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+ assert(cpi->mt_info.num_mod_workers[MOD_ENC] ==
+ cpi->mt_info.num_mod_workers[MOD_LPF]);
+ loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf,
+ cpi->mt_info.num_mod_workers[MOD_LPF],
+ &cpi->mt_info.lf_row_sync, lpf_opt_level,
+ cm->seq_params->mib_size_log2);
+ }
+}
+
void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
MultiThreadInfo *const mt_info = &cpi->mt_info;
AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
+ const int sb_rows_in_frame = get_sb_rows_in_frame(cm);
int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
- int max_sb_rows = 0, max_sb_cols = 0;
+ int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0;
int num_workers = mt_info->num_mod_workers[MOD_ENC];
- assert(IMPLIES(cpi->tile_data == NULL,
- cpi->allocated_tiles < tile_cols * tile_rows));
- if (cpi->allocated_tiles < tile_cols * tile_rows) {
- av1_row_mt_mem_dealloc(cpi);
+ compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile);
+ const bool alloc_row_mt_mem =
+ (enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_sb_rows_in_tile ||
+ enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) ||
+ enc_row_mt->allocated_sb_rows != sb_rows_in_frame);
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
av1_alloc_tile_data(cpi);
}
- av1_init_tile_data(cpi);
-
- compute_max_sb_rows_cols(cpi, &max_sb_rows, &max_sb_cols);
-
- if (enc_row_mt->allocated_tile_cols != tile_cols ||
- enc_row_mt->allocated_tile_rows != tile_rows ||
- enc_row_mt->allocated_rows != max_sb_rows ||
- enc_row_mt->allocated_cols != (max_sb_cols - 1)) {
- av1_row_mt_mem_dealloc(cpi);
- row_mt_mem_alloc(cpi, max_sb_rows, max_sb_cols,
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_sb_rows_in_tile, max_sb_cols_in_tile,
cpi->oxcf.algo_cfg.cdf_update_mode);
}
+ lpf_pipeline_mt_init(cpi);
+
+ av1_init_tile_data(cpi);
+
memset(thread_id_to_tile_id, -1,
sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ memset(enc_row_mt->num_tile_cols_done, 0,
+ sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame);
for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
@@ -1558,7 +1718,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
// Initialize num_finished_cols to -1 for all rows.
memset(row_mt_sync->num_finished_cols, -1,
- sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows);
+ sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile);
row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
row_mt_sync->num_threads_working = 0;
row_mt_sync->intrabc_extra_top_right_sb_delay =
@@ -1593,17 +1753,23 @@ void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
int num_workers = 0;
int max_mb_rows = 0;
- assert(IMPLIES(cpi->tile_data == NULL,
- cpi->allocated_tiles < tile_cols * tile_rows));
- if (cpi->allocated_tiles < tile_cols * tile_rows) {
- av1_row_mt_mem_dealloc(cpi);
+ max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size);
+ const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_mb_rows;
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
av1_alloc_tile_data(cpi);
}
- av1_init_tile_data(cpi);
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
+ }
- const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
- max_mb_rows = fp_compute_max_mb_rows(cm, cpi->tile_data, fp_block_size);
+ av1_init_tile_data(cpi);
// For pass = 1, compute the no. of workers needed. For single-pass encode
// (pass = 0), no. of workers are already computed.
@@ -1612,13 +1778,6 @@ void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
else
num_workers = mt_info->num_mod_workers[MOD_FP];
- if (enc_row_mt->allocated_tile_cols != tile_cols ||
- enc_row_mt->allocated_tile_rows != tile_rows ||
- enc_row_mt->allocated_rows != max_mb_rows) {
- av1_row_mt_mem_dealloc(cpi);
- row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
- }
-
memset(thread_id_to_tile_id, -1,
sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
@@ -2258,6 +2417,182 @@ void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
}
#endif // !CONFIG_REALTIME_ONLY
+// Allocate memory for row synchronization
+static void wiener_var_sync_mem_alloc(
+ AV1EncRowMultiThreadSync *const row_mt_sync, AV1_COMMON *const cm,
+ const int rows) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+ aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+ if (row_mt_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+ aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
+ if (row_mt_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&row_mt_sync->cond_[i], NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
+ aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));
+
+ row_mt_sync->rows = rows;
+ // Set up nsync.
+ row_mt_sync->sync_range = 1;
+}
+
+// Deallocate row based multi-threading synchronization related mutex and data
+static void wiener_var_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
+ if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (row_mt_sync->mutex_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
+ }
+ aom_free(row_mt_sync->mutex_);
+ }
+ if (row_mt_sync->cond_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_cond_destroy(&row_mt_sync->cond_[i]);
+ }
+ aom_free(row_mt_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(row_mt_sync->num_finished_cols);
+
+    // Clear the structure, as the source of this call may be a dynamic
+    // change in the tile configuration, in which case this call will be
+    // followed by an _alloc() that may fail.
+ av1_zero(*row_mt_sync);
+ }
+}
+
+static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi,
+ AVxWorkerHook hook,
+ const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+    // Set the starting tile for each thread. The preprocessing stage does
+    // not use tiles, so it is set to 0.
+ thread_data->start = 0;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ }
+ }
+}
+
+static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &cpi->tile_data[0].row_mt_sync;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ double sum_rec_distortion = 0;
+ double sum_est_rate = 0;
+ int has_jobs = 1;
+ while (has_jobs) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ has_jobs = get_next_job(&cpi->tile_data[0], &current_mi_row, mb_step);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ if (!has_jobs) break;
+ // TODO(chengchen): properly accumulate the distortion and rate.
+ av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
+ qcoeff, dqcoeff, &sum_rec_distortion,
+ &sum_est_rate);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ }
+ return 1;
+}
+
+// This function is the multi-threaded version of the Wiener variance
+// computation.
+// Note that the Wiener variance is used for allintra mode (1 pass) and is
+// computed before the frame is encoded, so the number of tiles does not need
+// to be considered; instead, all available threads are allocated to the
+// computation.
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate) {
+ (void)sum_rec_distortion;
+ (void)sum_est_rate;
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int tile_cols = 1;
+ const int tile_rows = 1;
+ if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+ cpi->tile_data->tile_info.mi_row_end = cm->mi_params.mi_rows;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &cpi->tile_data[0].row_mt_sync;
+
+ // TODO(chengchen): the memory usage could be improved.
+ const int mi_rows = cm->mi_params.mi_rows;
+ wiener_var_sync_mem_alloc(row_mt_sync, cm, mi_rows);
+
+ row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
+ row_mt_sync->num_threads_working = num_workers;
+ row_mt_sync->next_mi_row = 0;
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * num_workers);
+
+ prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, cm, num_workers);
+
+ wiener_var_sync_mem_dealloc(row_mt_sync);
+}
+
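
Job dispatch in this path is a shared-row counter: each worker locks the common mutex, claims the next MI row (advancing next_mi_row by mb_step), and processes rows until none remain. A compact stand-in for that claim loop (the real get_next_job() lives in the encoder; this models only the counter):

    /* Stand-in for the shared-row dispatch used by cal_mb_wiener_var_hook(). */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t job_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int next_mi_row = 0;

    static int claim_next_row(int mi_rows, int mb_step, int *mi_row) {
      int has_job = 0;
      pthread_mutex_lock(&job_mutex);
      if (next_mi_row < mi_rows) {
        *mi_row = next_mi_row;
        next_mi_row += mb_step; /* step is the 8x8 Weber block height in MI */
        has_job = 1;
      }
      pthread_mutex_unlock(&job_mutex);
      return has_job;
    }

    int main(void) {
      int mi_row;
      while (claim_next_row(8, 2, &mi_row)) printf("process MI row %d\n", mi_row);
      return 0;
    }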
// Compare and order tiles based on absolute sum of tx coeffs.
static int compare_tile_order(const void *a, const void *b) {
const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
@@ -2732,6 +3067,16 @@ static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) {
return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads);
}
+// Computes num_workers for all intra multi-threading.
+static AOM_INLINE int compute_num_ai_workers(AV1_COMP *cpi) {
+ if (cpi->oxcf.max_threads <= 1) return 1;
+ cpi->weber_bsize = BLOCK_8X8;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step;
+ return AOMMIN(num_mb_rows, cpi->oxcf.max_threads);
+}
+
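
With an 8x8 Weber block, mb_step above is 2 MI units, so a 1080p all-intra encode has 270 / 2 = 135 row jobs and the worker count is simply the smaller of that and max_threads. A small arithmetic sketch (the MI constants are stated assumptions):

    /* Worked example of compute_num_ai_workers() sizing, assuming
     * mi_size_wide[BLOCK_8X8] == 2 and 270 MI rows for a 1080p frame. */
    #include <stdio.h>

    #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))

    int main(void) {
      const int mi_rows = 270;                   /* 1080 luma rows / 4 */
      const int mb_step = 2;                     /* 8x8 Weber block in MI units */
      const int num_mb_rows = mi_rows / mb_step; /* 135 row jobs */
      const int max_threads = 16;
      printf("workers = %d\n", AOMMIN(num_mb_rows, max_threads)); /* 16 */
      return 0;
    }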
int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
int num_mod_workers = 0;
switch (mod_name) {
@@ -2758,6 +3103,14 @@ int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
case MOD_FRAME_ENC:
num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC];
break;
+ case MOD_AI:
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS) {
+ num_mod_workers = compute_num_ai_workers(cpi);
+ break;
+ } else {
+ num_mod_workers = 0;
+ break;
+ }
default: assert(0); break;
}
return (num_mod_workers);
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index a1de988b0..6c4bce4db 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -23,6 +23,8 @@ typedef struct EncWorkerData {
struct AV1_COMP *cpi;
struct ThreadData *td;
struct ThreadData *original_td;
+ AV1LfSync *lf_sync;
+ LFWorkerData *lf_data;
int start;
int thread_id;
} EncWorkerData;
@@ -71,6 +73,10 @@ void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync);
#endif // !CONFIG_REALTIME_ONLY
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate);
+
void av1_tf_do_filtering_mt(AV1_COMP *cpi);
void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 7ad0c8d13..3dee6443b 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -36,6 +36,7 @@
#include "av1/encoder/encodemb.h"
#include "av1/encoder/encodemv.h"
#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
#include "av1/encoder/encode_strategy.h"
#include "av1/encoder/ethread.h"
#include "av1/encoder/extend.h"
@@ -473,8 +474,10 @@ static int firstpass_intra_prediction(
set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale);
xd->plane[0].dst.buf = this_frame->y_buffer + y_offset;
- xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
- xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
+ if (num_planes > 1) {
+ xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
+ xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
+ }
xd->left_available = (unit_col != 0);
xd->mi[0]->bsize = bsize;
xd->mi[0]->ref_frame[0] = INTRA_FRAME;
@@ -761,8 +764,10 @@ static int firstpass_inter_prediction(
// Reset to last frame as reference buffer.
xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
- xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
- xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
+ if (av1_num_planes(&cpi->common) > 1) {
+ xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
+ }
} else {
stats->sr_coded_error += motion_error;
}
@@ -1196,8 +1201,10 @@ void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
// Adjust to the next column of MBs.
x->plane[0].src.buf += fp_block_size_width;
- x->plane[1].src.buf += uv_mb_height;
- x->plane[2].src.buf += uv_mb_height;
+ if (num_planes > 1) {
+ x->plane[1].src.buf += uv_mb_height;
+ x->plane[2].src.buf += uv_mb_height;
+ }
recon_yoffset += fp_block_size_width;
src_yoffset += fp_block_size_width;
@@ -1213,8 +1220,18 @@ void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) {
AV1_COMMON *const cm = &cpi->common;
CurrentFrame *const current_frame = &cm->current_frame;
const CommonModeInfoParams *const mi_params = &cm->mi_params;
- const int unit_rows = get_unit_rows(BLOCK_16X16, mi_params->mb_rows);
- const int unit_cols = get_unit_cols(BLOCK_16X16, mi_params->mb_cols);
+ int max_mb_rows = mi_params->mb_rows;
+ int max_mb_cols = mi_params->mb_cols;
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+ int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+ max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2);
+ }
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+ int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+ max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2);
+ }
+ const int unit_rows = get_unit_rows(BLOCK_16X16, max_mb_rows);
+ const int unit_cols = get_unit_cols(BLOCK_16X16, max_mb_cols);
setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
@@ -1248,10 +1265,21 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
const BLOCK_SIZE fp_block_size =
get_fp_block_size(cpi->is_screen_content_type);
+ int max_mb_rows = mi_params->mb_rows;
+ int max_mb_cols = mi_params->mb_cols;
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+ int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+ max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2);
+ }
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+ int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+ max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2);
+ }
+
// Number of rows in the unit size.
- // Note mi_params->mb_rows and mi_params->mb_cols are in the unit of 16x16.
- const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
- const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
+  // Note max_mb_rows and max_mb_cols are in units of 16x16 blocks.
+ const int unit_rows = get_unit_rows(fp_block_size, max_mb_rows);
+ const int unit_cols = get_unit_cols(fp_block_size, max_mb_cols);
// Set fp_block_size, for the convenience of multi-thread usage.
cpi->fp_block_size = fp_block_size;
@@ -1267,7 +1295,6 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
if (cpi->allocated_tiles < tile_cols * tile_rows) {
- av1_row_mt_mem_dealloc(cpi);
av1_alloc_tile_data(cpi);
}
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index ca4401bc1..d5f750f67 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -355,6 +355,7 @@ typedef struct GF_GROUP {
int max_layer_depth_allowed;
// This is currently only populated for AOM_Q mode
int q_val[MAX_STATIC_GF_GROUP_LENGTH];
+ int rdmult_val[MAX_STATIC_GF_GROUP_LENGTH];
int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
// The frame coding type - inter/intra frame
FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
@@ -578,7 +579,7 @@ void av1_accumulate_stats(FIRSTPASS_STATS *section,
* \param[in] cpi Top-level encoder structure
* \param[in] ts_duration Duration of the frame / collection of frames
*
- * \return Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
+ * \remark Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
* is modified to store information computed in this function.
*/
void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index 5e03d79ae..9e84e537d 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -21,50 +21,15 @@
#include "av1/encoder/global_motion.h"
#include "av1/common/convolve.h"
-#include "av1/common/resize.h"
#include "av1/common/warped_motion.h"
#include "av1/encoder/segmentation.h"
-#include "av1/encoder/corner_detect.h"
-#include "av1/encoder/corner_match.h"
-#include "av1/encoder/ransac.h"
-
-#define MIN_INLIER_PROB 0.1
#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
// Border over which to compute the global motion
#define ERRORADV_BORDER 0
-// Number of pyramid levels in disflow computation
-#define N_LEVELS 2
-// Size of square patches in the disflow dense grid
-#define PATCH_SIZE 8
-// Center point of square patch
-#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1)
-// Step size between patches, lower value means greater patch overlap
-#define PATCH_STEP 1
-// Minimum size of border padding for disflow
-#define MIN_PAD 7
-// Warp error convergence threshold for disflow
-#define DISFLOW_ERROR_TR 0.01
-// Max number of iterations if warp convergence is not found
-#define DISFLOW_MAX_ITR 10
-
-// Struct for an image pyramid
-typedef struct {
- int n_levels;
- int pad_size;
- int has_gradient;
- int widths[N_LEVELS];
- int heights[N_LEVELS];
- int strides[N_LEVELS];
- int level_loc[N_LEVELS];
- unsigned char *level_buffer;
- double *level_dx_buffer;
- double *level_dy_buffer;
-} ImagePyramid;
-
int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
return best_erroradvantage < erroradv_tr &&
best_erroradvantage * params_cost < erroradv_prod_tr;
@@ -358,39 +323,6 @@ int64_t av1_refine_integerized_param(
return best_error;
}
-unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) {
- int i, j;
- uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer);
- uint8_t *buf_8bit = frm->y_buffer_8bit;
- assert(buf_8bit);
- if (!frm->buf_8bit_valid) {
- for (i = 0; i < frm->y_height; ++i) {
- for (j = 0; j < frm->y_width; ++j) {
- buf_8bit[i * frm->y_stride + j] =
- orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
- }
- }
- frm->buf_8bit_valid = 1;
- }
- return buf_8bit;
-}
-
-static bool get_inliers_from_indices(MotionModel *params,
- int *correspondences) {
- int *inliers_tmp = (int *)aom_malloc(2 * MAX_CORNERS * sizeof(*inliers_tmp));
- if (!inliers_tmp) return false;
- memset(inliers_tmp, 0, 2 * MAX_CORNERS * sizeof(*inliers_tmp));
-
- for (int i = 0; i < params->num_inliers; i++) {
- int index = params->inliers[i];
- inliers_tmp[2 * i] = correspondences[4 * index];
- inliers_tmp[2 * i + 1] = correspondences[4 * index + 1];
- }
- memcpy(params->inliers, inliers_tmp, sizeof(*inliers_tmp) * 2 * MAX_CORNERS);
- aom_free(inliers_tmp);
- return true;
-}
-
#define FEAT_COUNT_TR 3
#define SEG_COUNT_TR 0.40
void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
@@ -420,625 +352,3 @@ void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
if (seg_count < (width * height * SEG_COUNT_TR))
memset(segment_map, 1, width * height * sizeof(*segment_map));
}
-
-static int compute_global_motion_feature_based(
- TransformationType type, unsigned char *src_buffer, int src_width,
- int src_height, int src_stride, int *src_corners, int num_src_corners,
- YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
- MotionModel *params_by_motion, int num_motions) {
- int i;
- int num_ref_corners;
- int num_correspondences;
- int *correspondences;
- int ref_corners[2 * MAX_CORNERS];
- unsigned char *ref_buffer = ref->y_buffer;
- RansacFunc ransac = av1_get_ransac_type(type);
-
- if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
- ref_buffer = av1_downconvert_frame(ref, bit_depth);
- }
-
- num_ref_corners =
- av1_fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
- ref->y_stride, ref_corners, MAX_CORNERS);
-
- // find correspondences between the two images
- correspondences =
- (int *)malloc(num_src_corners * 4 * sizeof(*correspondences));
- if (!correspondences) return 0;
- num_correspondences = av1_determine_correspondence(
- src_buffer, (int *)src_corners, num_src_corners, ref_buffer,
- (int *)ref_corners, num_ref_corners, src_width, src_height, src_stride,
- ref->y_stride, correspondences);
-
- ransac(correspondences, num_correspondences, num_inliers_by_motion,
- params_by_motion, num_motions);
-
- // Set num_inliers = 0 for motions with too few inliers so they are ignored.
- for (i = 0; i < num_motions; ++i) {
- if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences ||
- num_correspondences == 0) {
- num_inliers_by_motion[i] = 0;
- } else if (!get_inliers_from_indices(&params_by_motion[i],
- correspondences)) {
- free(correspondences);
- return 0;
- }
- }
-
- free(correspondences);
-
- // Return true if any one of the motions has inliers.
- for (i = 0; i < num_motions; ++i) {
- if (num_inliers_by_motion[i] > 0) return 1;
- }
- return 0;
-}
-
-// Don't use points around the frame border since they are less reliable
-static INLINE int valid_point(int x, int y, int width, int height) {
- return (x > (PATCH_SIZE + PATCH_CENTER)) &&
- (x < (width - PATCH_SIZE - PATCH_CENTER)) &&
- (y > (PATCH_SIZE + PATCH_CENTER)) &&
- (y < (height - PATCH_SIZE - PATCH_CENTER));
-}
-
-static int determine_disflow_correspondence(int *frm_corners,
- int num_frm_corners, double *flow_u,
- double *flow_v, int width,
- int height, int stride,
- double *correspondences) {
- int num_correspondences = 0;
- int x, y;
- for (int i = 0; i < num_frm_corners; ++i) {
- x = frm_corners[2 * i];
- y = frm_corners[2 * i + 1];
- if (valid_point(x, y, width, height)) {
- correspondences[4 * num_correspondences] = x;
- correspondences[4 * num_correspondences + 1] = y;
- correspondences[4 * num_correspondences + 2] = x + flow_u[y * stride + x];
- correspondences[4 * num_correspondences + 3] = y + flow_v[y * stride + x];
- num_correspondences++;
- }
- }
- return num_correspondences;
-}
-
-static double getCubicValue(double p[4], double x) {
- return p[1] + 0.5 * x *
- (p[2] - p[0] +
- x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
- x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
-}
-
-static void get_subcolumn(unsigned char *ref, double col[4], int stride, int x,
- int y_start) {
- int i;
- for (i = 0; i < 4; ++i) {
- col[i] = ref[(i + y_start) * stride + x];
- }
-}
-
-static double bicubic(unsigned char *ref, double x, double y, int stride) {
- double arr[4];
- int k;
- int i = (int)x;
- int j = (int)y;
- for (k = 0; k < 4; ++k) {
- double arr_temp[4];
- get_subcolumn(ref, arr_temp, stride, i + k - 1, j - 1);
- arr[k] = getCubicValue(arr_temp, y - j);
- }
- return getCubicValue(arr, x - i);
-}
-
-// Interpolate a warped block using bicubic interpolation when possible
-static unsigned char interpolate(unsigned char *ref, double x, double y,
- int width, int height, int stride) {
- if (x < 0 && y < 0)
- return ref[0];
- else if (x < 0 && y > height - 1)
- return ref[(height - 1) * stride];
- else if (x > width - 1 && y < 0)
- return ref[width - 1];
- else if (x > width - 1 && y > height - 1)
- return ref[(height - 1) * stride + (width - 1)];
- else if (x < 0) {
- int v;
- int i = (int)y;
- double a = y - i;
- if (y > 1 && y < height - 2) {
- double arr[4];
- get_subcolumn(ref, arr, stride, 0, i - 1);
- return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255);
- }
- v = (int)(ref[i * stride] * (1 - a) + ref[(i + 1) * stride] * a + 0.5);
- return clamp(v, 0, 255);
- } else if (y < 0) {
- int v;
- int j = (int)x;
- double b = x - j;
- if (x > 1 && x < width - 2) {
- double arr[4] = { ref[j - 1], ref[j], ref[j + 1], ref[j + 2] };
- return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255);
- }
- v = (int)(ref[j] * (1 - b) + ref[j + 1] * b + 0.5);
- return clamp(v, 0, 255);
- } else if (x > width - 1) {
- int v;
- int i = (int)y;
- double a = y - i;
- if (y > 1 && y < height - 2) {
- double arr[4];
- get_subcolumn(ref, arr, stride, width - 1, i - 1);
- return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255);
- }
- v = (int)(ref[i * stride + width - 1] * (1 - a) +
- ref[(i + 1) * stride + width - 1] * a + 0.5);
- return clamp(v, 0, 255);
- } else if (y > height - 1) {
- int v;
- int j = (int)x;
- double b = x - j;
- if (x > 1 && x < width - 2) {
- int row = (height - 1) * stride;
- double arr[4] = { ref[row + j - 1], ref[row + j], ref[row + j + 1],
- ref[row + j + 2] };
- return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255);
- }
- v = (int)(ref[(height - 1) * stride + j] * (1 - b) +
- ref[(height - 1) * stride + j + 1] * b + 0.5);
- return clamp(v, 0, 255);
- } else if (x > 1 && y > 1 && x < width - 2 && y < height - 2) {
- return clamp((int)(bicubic(ref, x, y, stride) + 0.5), 0, 255);
- } else {
- int i = (int)y;
- int j = (int)x;
- double a = y - i;
- double b = x - j;
- int v = (int)(ref[i * stride + j] * (1 - a) * (1 - b) +
- ref[i * stride + j + 1] * (1 - a) * b +
- ref[(i + 1) * stride + j] * a * (1 - b) +
- ref[(i + 1) * stride + j + 1] * a * b);
- return clamp(v, 0, 255);
- }
-}
-
-// Warps a block using flow vector [u, v] and computes the mse
-static double compute_warp_and_error(unsigned char *ref, unsigned char *frm,
- int width, int height, int stride, int x,
- int y, double u, double v, int16_t *dt) {
- int i, j;
- unsigned char warped;
- double x_w, y_w;
- double mse = 0;
- int16_t err = 0;
- for (i = y; i < y + PATCH_SIZE; ++i)
- for (j = x; j < x + PATCH_SIZE; ++j) {
- x_w = (double)j + u;
- y_w = (double)i + v;
- warped = interpolate(ref, x_w, y_w, width, height, stride);
- err = warped - frm[j + i * stride];
- mse += err * err;
- dt[(i - y) * PATCH_SIZE + (j - x)] = err;
- }
-
- mse /= (PATCH_SIZE * PATCH_SIZE);
- return mse;
-}
-
-// Computes the components of the system of equations used to solve for
-// a flow vector. This includes:
-// 1.) The hessian matrix for optical flow. This matrix is in the
-// form of:
-//
-// M = |sum(dx * dx) sum(dx * dy)|
-// |sum(dx * dy) sum(dy * dy)|
-//
-// 2.) b = |sum(dx * dt)|
-// |sum(dy * dt)|
-// Where the sums are computed over a square window of PATCH_SIZE.
-static INLINE void compute_flow_system(const double *dx, int dx_stride,
- const double *dy, int dy_stride,
- const int16_t *dt, int dt_stride,
- double *M, double *b) {
- for (int i = 0; i < PATCH_SIZE; i++) {
- for (int j = 0; j < PATCH_SIZE; j++) {
- M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
- M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
- M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
-
- b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
- b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
- }
- }
-
- M[2] = M[1];
-}
-
-// Solves a general Mx = b where M is a 2x2 matrix and b is a 2x1 matrix
-static INLINE void solve_2x2_system(const double *M, const double *b,
- double *output_vec) {
- double M_0 = M[0];
- double M_3 = M[3];
- double det = (M_0 * M_3) - (M[1] * M[2]);
- if (det < 1e-5) {
- // Handle singular matrix
- // TODO(sarahparker) compare results using pseudo inverse instead
- M_0 += 1e-10;
- M_3 += 1e-10;
- det = (M_0 * M_3) - (M[1] * M[2]);
- }
- const double det_inv = 1 / det;
- const double mult_b0 = det_inv * b[0];
- const double mult_b1 = det_inv * b[1];
- output_vec[0] = M_3 * mult_b0 - M[1] * mult_b1;
- output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1;
-}
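
Taken together, compute_flow_system() and solve_2x2_system() perform one Lucas-Kanade style normal-equation update for the patch. In closed form (again just restating the code, using the M[0..3] and b[0..1] entries accumulated above):

\[
  \begin{pmatrix}\Delta u\\ \Delta v\end{pmatrix} = M^{-1} b,
  \qquad
  M^{-1} = \frac{1}{M_0 M_3 - M_1 M_2}
  \begin{pmatrix} M_3 & -M_1 \\ -M_2 & M_0 \end{pmatrix}
\]

compute_flow_at_point() below adds (\Delta u, \Delta v) to the running flow estimate on every iteration until the warp error drops below DISFLOW_ERROR_TR.
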
-
-/*
-static INLINE void image_difference(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int16_t *dst, int dst_stride, int height,
- int width) {
- const int block_unit = 8;
- // Take difference in 8x8 blocks to make use of optimized diff function
- for (int i = 0; i < height; i += block_unit) {
- for (int j = 0; j < width; j += block_unit) {
- aom_subtract_block(block_unit, block_unit, dst + i * dst_stride + j,
- dst_stride, src + i * src_stride + j, src_stride,
- ref + i * ref_stride + j, ref_stride);
- }
- }
-}
-*/
-
-// Compute an image gradient using a sobel filter.
-// If dir == 1, compute the x gradient. If dir == 0, compute y. This function
-// assumes the images have been padded so that they can be processed in units
-// of 8.
-static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride,
- double *dst, int dst_stride,
- int height, int width, int dir) {
- double norm = 1.0;
- // TODO(sarahparker) experiment with doing this over larger block sizes
- const int block_unit = 8;
- // Filter in 8x8 blocks to eventually make use of optimized convolve function
- for (int i = 0; i < height; i += block_unit) {
- for (int j = 0; j < width; j += block_unit) {
- av1_convolve_2d_sobel_y_c(src + i * src_stride + j, src_stride,
- dst + i * dst_stride + j, dst_stride,
- block_unit, block_unit, dir, norm);
- }
- }
-}
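
For orientation, the standard 3x3 Sobel kernels are shown below; av1_convolve_2d_sobel_y_c is assumed to apply these (up to the norm factor), although its definition is not visible in this hunk:

\[
  G_x = \begin{pmatrix} -1 & 0 & 1 \\ -2 & 0 & 2 \\ -1 & 0 & 1 \end{pmatrix},
  \qquad
  G_y = G_x^{\mathsf{T}}
\]
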
-
-static void free_pyramid(ImagePyramid *pyr) {
- aom_free(pyr->level_buffer);
- if (pyr->has_gradient) {
- aom_free(pyr->level_dx_buffer);
- aom_free(pyr->level_dy_buffer);
- }
- aom_free(pyr);
-}
-
-static ImagePyramid *alloc_pyramid(int width, int height, int pad_size,
- int compute_gradient) {
- ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr));
- if (!pyr) return NULL;
- pyr->has_gradient = compute_gradient;
- // 2 * width * height is the upper bound for a buffer that fits
- // all pyramid levels + padding for each level
- const int buffer_size = sizeof(*pyr->level_buffer) * 2 * width * height +
- (width + 2 * pad_size) * 2 * pad_size * N_LEVELS;
- pyr->level_buffer = aom_malloc(buffer_size);
- if (!pyr->level_buffer) {
- free_pyramid(pyr);
- return NULL;
- }
- memset(pyr->level_buffer, 0, buffer_size);
-
- if (compute_gradient) {
- const int gradient_size =
- sizeof(*pyr->level_dx_buffer) * 2 * width * height +
- (width + 2 * pad_size) * 2 * pad_size * N_LEVELS;
- pyr->level_dx_buffer = aom_calloc(1, gradient_size);
- pyr->level_dy_buffer = aom_calloc(1, gradient_size);
- if (!(pyr->level_dx_buffer && pyr->level_dy_buffer)) {
- free_pyramid(pyr);
- return NULL;
- }
- }
- return pyr;
-}
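
The "2 * width * height" bound used in alloc_pyramid() follows from the geometric series over pyramid levels, since each level halves both dimensions:

\[
  \sum_{k \ge 0} \frac{w}{2^k}\cdot\frac{h}{2^k} \;=\; w h \sum_{k \ge 0} 4^{-k} \;=\; \tfrac{4}{3}\,w h \;<\; 2\,w h
\]

The extra (width + 2 * pad_size) * 2 * pad_size * N_LEVELS term covers the padded border rows of each level.
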
-
-static INLINE void update_level_dims(ImagePyramid *frm_pyr, int level) {
- frm_pyr->widths[level] = frm_pyr->widths[level - 1] >> 1;
- frm_pyr->heights[level] = frm_pyr->heights[level - 1] >> 1;
- frm_pyr->strides[level] = frm_pyr->widths[level] + 2 * frm_pyr->pad_size;
- // Point the beginning of the next level buffer to the correct location inside
- // the padded border
- frm_pyr->level_loc[level] =
- frm_pyr->level_loc[level - 1] +
- frm_pyr->strides[level - 1] *
- (2 * frm_pyr->pad_size + frm_pyr->heights[level - 1]);
-}
-
-// Compute coarse to fine pyramids for a frame
-static void compute_flow_pyramids(unsigned char *frm, const int frm_width,
- const int frm_height, const int frm_stride,
- int n_levels, int pad_size, int compute_grad,
- ImagePyramid *frm_pyr) {
- int cur_width, cur_height, cur_stride, cur_loc;
- assert((frm_width >> n_levels) > 0);
- assert((frm_height >> n_levels) > 0);
-
- // Initialize first level
- frm_pyr->n_levels = n_levels;
- frm_pyr->pad_size = pad_size;
- frm_pyr->widths[0] = frm_width;
- frm_pyr->heights[0] = frm_height;
- frm_pyr->strides[0] = frm_width + 2 * frm_pyr->pad_size;
- // Point the beginning of the level buffer to the location inside
- // the padded border
- frm_pyr->level_loc[0] =
- frm_pyr->strides[0] * frm_pyr->pad_size + frm_pyr->pad_size;
- // This essentially copies the original buffer into the pyramid buffer
- // without the original padding
- av1_resize_plane(frm, frm_height, frm_width, frm_stride,
- frm_pyr->level_buffer + frm_pyr->level_loc[0],
- frm_pyr->heights[0], frm_pyr->widths[0],
- frm_pyr->strides[0]);
-
- if (compute_grad) {
- cur_width = frm_pyr->widths[0];
- cur_height = frm_pyr->heights[0];
- cur_stride = frm_pyr->strides[0];
- cur_loc = frm_pyr->level_loc[0];
- assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
- frm_pyr->level_dy_buffer != NULL);
- // Compute the x gradient
- sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
- frm_pyr->level_dx_buffer + cur_loc, cur_stride,
- cur_height, cur_width, 1);
-
- // Compute the y gradient
- sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
- frm_pyr->level_dy_buffer + cur_loc, cur_stride,
- cur_height, cur_width, 0);
- }
-
- // Start at the finest level and resize down to the coarsest level
- for (int level = 1; level < n_levels; ++level) {
- update_level_dims(frm_pyr, level);
- cur_width = frm_pyr->widths[level];
- cur_height = frm_pyr->heights[level];
- cur_stride = frm_pyr->strides[level];
- cur_loc = frm_pyr->level_loc[level];
-
- av1_resize_plane(frm_pyr->level_buffer + frm_pyr->level_loc[level - 1],
- frm_pyr->heights[level - 1], frm_pyr->widths[level - 1],
- frm_pyr->strides[level - 1],
- frm_pyr->level_buffer + cur_loc, cur_height, cur_width,
- cur_stride);
-
- if (compute_grad) {
- assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
- frm_pyr->level_dy_buffer != NULL);
- // Compute the x gradient
- sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
- frm_pyr->level_dx_buffer + cur_loc, cur_stride,
- cur_height, cur_width, 1);
-
- // Compute the y gradient
- sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
- frm_pyr->level_dy_buffer + cur_loc, cur_stride,
- cur_height, cur_width, 0);
- }
- }
-}
-
-static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
- double *dx, double *dy, int x, int y,
- int width, int height, int stride,
- double *u, double *v) {
- double M[4] = { 0 };
- double b[2] = { 0 };
- double tmp_output_vec[2] = { 0 };
- double error = 0;
- int16_t dt[PATCH_SIZE * PATCH_SIZE];
- double o_u = *u;
- double o_v = *v;
-
- for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
- error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
- *v, dt);
- if (error <= DISFLOW_ERROR_TR) break;
- compute_flow_system(dx, stride, dy, stride, dt, PATCH_SIZE, M, b);
- solve_2x2_system(M, b, tmp_output_vec);
- *u += tmp_output_vec[0];
- *v += tmp_output_vec[1];
- }
- if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_v) > PATCH_SIZE) {
- *u = o_u;
- *v = o_v;
- }
-}
-
-// make sure flow_u and flow_v start at 0
-static bool compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
- double *flow_u, double *flow_v) {
- int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
- double *u_upscale =
- aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
- double *v_upscale =
- aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
- if (!(u_upscale && v_upscale)) {
- aom_free(u_upscale);
- aom_free(v_upscale);
- return false;
- }
-
- assert(frm_pyr->n_levels == ref_pyr->n_levels);
-
- // Compute flow field from coarsest to finest level of the pyramid
- for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
- cur_width = frm_pyr->widths[level];
- cur_height = frm_pyr->heights[level];
- cur_stride = frm_pyr->strides[level];
- cur_loc = frm_pyr->level_loc[level];
-
- for (int i = PATCH_SIZE; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
- for (int j = PATCH_SIZE; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
- patch_loc = i * cur_stride + j;
- patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
- compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
- ref_pyr->level_buffer + cur_loc,
- frm_pyr->level_dx_buffer + cur_loc + patch_loc,
- frm_pyr->level_dy_buffer + cur_loc + patch_loc, j,
- i, cur_width, cur_height, cur_stride,
- flow_u + patch_center, flow_v + patch_center);
- }
- }
- // TODO(sarahparker) Replace this with upscale function in resize.c
- if (level > 0) {
- int h_upscale = frm_pyr->heights[level - 1];
- int w_upscale = frm_pyr->widths[level - 1];
- int s_upscale = frm_pyr->strides[level - 1];
- for (int i = 0; i < h_upscale; ++i) {
- for (int j = 0; j < w_upscale; ++j) {
- u_upscale[j + i * s_upscale] =
- flow_u[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
- v_upscale[j + i * s_upscale] =
- flow_v[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
- }
- }
- memcpy(flow_u, u_upscale,
- frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
- memcpy(flow_v, v_upscale,
- frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
- }
- }
- aom_free(u_upscale);
- aom_free(v_upscale);
- return true;
-}
-
-static int compute_global_motion_disflow_based(
- TransformationType type, unsigned char *frm_buffer, int frm_width,
- int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
- YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
- MotionModel *params_by_motion, int num_motions) {
- unsigned char *ref_buffer = ref->y_buffer;
- const int ref_width = ref->y_width;
- const int ref_height = ref->y_height;
- const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD);
- int num_correspondences;
- double *correspondences;
- RansacFuncDouble ransac = av1_get_ransac_double_prec_type(type);
- assert(frm_width == ref_width);
- assert(frm_height == ref_height);
-
- // Ensure the number of pyramid levels will work with the frame resolution
- const int msb =
- frm_width < frm_height ? get_msb(frm_width) : get_msb(frm_height);
- const int n_levels = AOMMIN(msb, N_LEVELS);
-
- if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
- ref_buffer = av1_downconvert_frame(ref, bit_depth);
- }
-
- // TODO(sarahparker) We will want to do the source pyramid computation
- // outside of this function so it doesn't get recomputed for every
- // reference. We also don't need to compute every pyramid level for the
- // reference in advance, since lower levels can be overwritten once their
- // flow field is computed and upscaled. I'll add these optimizations
- // once the full implementation is working.
- // Allocate frm image pyramids
- int compute_gradient = 1;
- ImagePyramid *frm_pyr =
- alloc_pyramid(frm_width, frm_height, pad_size, compute_gradient);
- if (!frm_pyr) return 0;
- compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm_stride, n_levels,
- pad_size, compute_gradient, frm_pyr);
- // Allocate ref image pyramids
- compute_gradient = 0;
- ImagePyramid *ref_pyr =
- alloc_pyramid(ref_width, ref_height, pad_size, compute_gradient);
- if (!ref_pyr) {
- free_pyramid(frm_pyr);
- return 0;
- }
- compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride,
- n_levels, pad_size, compute_gradient, ref_pyr);
-
- int ret = 0;
- double *flow_u =
- aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
- double *flow_v =
- aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
- if (!(flow_u && flow_v)) goto Error;
-
- memset(flow_u, 0,
- frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
- memset(flow_v, 0,
- frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
-
- if (!compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v)) goto Error;
-
- // find correspondences between the two images using the flow field
- correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences));
- if (!correspondences) goto Error;
- num_correspondences = determine_disflow_correspondence(
- frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height,
- frm_pyr->strides[0], correspondences);
- ransac(correspondences, num_correspondences, num_inliers_by_motion,
- params_by_motion, num_motions);
-
- // Set num_inliers = 0 for motions with too few inliers so they are ignored.
- for (int i = 0; i < num_motions; ++i) {
- if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
- num_inliers_by_motion[i] = 0;
- }
- }
-
- // Return true if any one of the motions has inliers.
- for (int i = 0; i < num_motions; ++i) {
- if (num_inliers_by_motion[i] > 0) {
- ret = 1;
- break;
- }
- }
-
- aom_free(correspondences);
-Error:
- free_pyramid(frm_pyr);
- free_pyramid(ref_pyr);
- aom_free(flow_u);
- aom_free(flow_v);
- return ret;
-}
-
-int av1_compute_global_motion(TransformationType type,
- unsigned char *src_buffer, int src_width,
- int src_height, int src_stride, int *src_corners,
- int num_src_corners, YV12_BUFFER_CONFIG *ref,
- int bit_depth,
- GlobalMotionEstimationType gm_estimation_type,
- int *num_inliers_by_motion,
- MotionModel *params_by_motion, int num_motions) {
- switch (gm_estimation_type) {
- case GLOBAL_MOTION_FEATURE_BASED:
- return compute_global_motion_feature_based(
- type, src_buffer, src_width, src_height, src_stride, src_corners,
- num_src_corners, ref, bit_depth, num_inliers_by_motion,
- params_by_motion, num_motions);
- case GLOBAL_MOTION_DISFLOW_BASED:
- return compute_global_motion_disflow_based(
- type, src_buffer, src_width, src_height, src_stride, src_corners,
- num_src_corners, ref, bit_depth, num_inliers_by_motion,
- params_by_motion, num_motions);
- default: assert(0 && "Unknown global motion estimation type");
- }
- return 0;
-}
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
index a70bfa8eb..4fa3253e1 100644
--- a/av1/encoder/global_motion.h
+++ b/av1/encoder/global_motion.h
@@ -13,34 +13,18 @@
#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
#include "aom/aom_integer.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
#include "aom_scale/yv12config.h"
#include "aom_util/aom_thread.h"
-#include "av1/common/mv.h"
-#include "av1/common/warped_motion.h"
-
#ifdef __cplusplus
extern "C" {
#endif
-#define MAX_CORNERS 4096
#define RANSAC_NUM_MOTIONS 1
#define GM_REFINEMENT_COUNT 5
#define MAX_DIRECTIONS 2
-typedef enum {
- GLOBAL_MOTION_FEATURE_BASED,
- GLOBAL_MOTION_DISFLOW_BASED,
-} GlobalMotionEstimationType;
-
-unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth);
-
-typedef struct {
- double params[MAX_PARAMDIM - 1];
- int *inliers;
- int num_inliers;
-} MotionModel;
-
// The structure holds a valid reference frame type and its temporal distance
// from the source frame.
typedef struct {
@@ -129,29 +113,6 @@ int64_t av1_refine_integerized_param(
int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
int64_t erroradv_threshold);
-/*
- Computes "num_motions" candidate global motion parameters between two frames.
- The array "params_by_motion" should be length 8 * "num_motions". The ordering
- of each set of parameters is best described by the homography:
-
- [x' (m2 m3 m0 [x
- z . y' = m4 m5 m1 * y
- 1] m6 m7 1) 1]
-
- where m{i} represents the ith value in any given set of parameters.
-
- "num_inliers" should be length "num_motions", and will be populated with the
- number of inlier feature points for each motion. Params for which the
- num_inliers entry is 0 should be ignored by the caller.
-*/
-int av1_compute_global_motion(TransformationType type,
- unsigned char *src_buffer, int src_width,
- int src_height, int src_stride, int *src_corners,
- int num_src_corners, YV12_BUFFER_CONFIG *ref,
- int bit_depth,
- GlobalMotionEstimationType gm_estimation_type,
- int *num_inliers_by_motion,
- MotionModel *params_by_motion, int num_motions);
#ifdef __cplusplus
} // extern "C"
#endif
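
For readers of the comment removed from this header, the documented parameter ordering corresponds to the point mapping sketched below. The helper is a hypothetical illustration only, not part of the library API:

    // Maps (x, y) through one 8-parameter motion model laid out as
    // [m0..m7] in the order described by the removed comment
    // (hypothetical helper for illustration).
    static void map_point(const double *m, double x, double y,
                          double *xp, double *yp) {
      const double z = m[6] * x + m[7] * y + 1.0;
      *xp = (m[2] * x + m[3] * y + m[0]) / z;
      *yp = (m[4] * x + m[5] * y + m[1]) / z;
    }
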
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c
index 323b4e858..0df070a18 100644
--- a/av1/encoder/global_motion_facade.c
+++ b/av1/encoder/global_motion_facade.c
@@ -11,7 +11,9 @@
#include "aom_dsp/binary_codes_writer.h"
-#include "av1/encoder/corner_detect.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "av1/common/warped_motion.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/ethread.h"
#include "av1/encoder/rdopt.h"
@@ -121,7 +123,7 @@ static AOM_INLINE void compute_global_motion_for_ref_frame(
params_by_motion[i].num_inliers = 0;
}
- av1_compute_global_motion(model, src_buffer, src_width, src_height,
+ aom_compute_global_motion(model, src_buffer, src_width, src_height,
src_stride, src_corners, num_src_corners,
ref_buf[frame], cpi->common.seq_params->bit_depth,
gm_estimation_type, inliers_by_motion,
@@ -133,6 +135,16 @@ static AOM_INLINE void compute_global_motion_for_ref_frame(
params_this_motion = params_by_motion[i].params;
av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
+ // Work around a bug in the AV1 specification
+ //
+ // For TRANSLATION type global motion models, gm_get_motion_vector() gives
+ // the wrong motion vector (see comments in that function for details).
+ // As translation-type models do not give much gain, we can avoid this bug
+ // by never choosing a TRANSLATION type model
+ if (tmp_wm_params.wmtype == TRANSLATION) {
+ continue;
+ }
+
if (tmp_wm_params.wmtype != IDENTITY) {
av1_compute_feature_segmentation_map(
segment_map, segment_map_w, segment_map_h,
@@ -154,6 +166,12 @@ static AOM_INLINE void compute_global_motion_for_ref_frame(
GM_REFINEMENT_COUNT, best_warp_error, segment_map, segment_map_w,
erroradv_threshold);
+ // av1_refine_integerized_param() can return a TRANSLATION type model
+ // even if its input is some other type, so we have to skip those too
+ if (tmp_wm_params.wmtype == TRANSLATION) {
+ continue;
+ }
+
if (warp_error < best_warp_error) {
best_warp_error = warp_error;
// Save the wm_params modified by
@@ -168,6 +186,8 @@ static AOM_INLINE void compute_global_motion_for_ref_frame(
if (!av1_get_shear_params(&cm->global_motion[frame]))
cm->global_motion[frame] = default_warp_params;
+#if 0
+ // We never choose translational models, so this code is disabled
if (cm->global_motion[frame].wmtype == TRANSLATION) {
cm->global_motion[frame].wmmat[0] =
convert_to_trans_prec(cm->features.allow_high_precision_mv,
@@ -178,6 +198,7 @@ static AOM_INLINE void compute_global_motion_for_ref_frame(
cm->global_motion[frame].wmmat[1]) *
GM_TRANS_ONLY_DECODE_FACTOR;
}
+#endif
if (cm->global_motion[frame].wmtype == IDENTITY) continue;
@@ -477,6 +498,7 @@ void av1_compute_global_motion_facade(AV1_COMP *cpi) {
}
if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
+ cpi->superres_mode == AOM_SUPERRES_NONE &&
cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done) {
setup_global_motion_info_params(cpi);
if (cpi->mt_info.num_workers > 1)
diff --git a/av1/encoder/gop_structure.c b/av1/encoder/gop_structure.c
index ab2008393..f1689dec6 100644
--- a/av1/encoder/gop_structure.c
+++ b/av1/encoder/gop_structure.c
@@ -705,8 +705,9 @@ static int construct_multi_layer_gf_structure(
}
} else {
// Set layer depth threshold for reordering as per the gf length.
- int depth_thr =
- (actual_gf_length == 16) ? 3 : (actual_gf_length == 32) ? 4 : INT_MAX;
+ int depth_thr = (actual_gf_length == 16) ? 3
+ : (actual_gf_length == 32) ? 4
+ : INT_MAX;
set_multi_layer_params_for_fp(
twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
@@ -824,10 +825,15 @@ void av1_gop_setup_structure(AV1_COMP *cpi) {
const int key_frame = rc->frames_since_key == 0;
FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;
- if (key_frame)
+ if (key_frame) {
first_frame_update_type = KF_UPDATE;
- else if (!cpi->ppi->gf_state.arf_gf_boost_lst)
+ if (cpi->oxcf.kf_max_pyr_height != -1) {
+ gf_group->max_layer_depth_allowed = AOMMIN(
+ cpi->oxcf.kf_max_pyr_height, gf_group->max_layer_depth_allowed);
+ }
+ } else if (!cpi->ppi->gf_state.arf_gf_boost_lst) {
first_frame_update_type = GF_UPDATE;
+ }
gf_group->size = construct_multi_layer_gf_structure(
cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval,
diff --git a/av1/encoder/gop_structure.h b/av1/encoder/gop_structure.h
index eb20c8461..ff22f5413 100644
--- a/av1/encoder/gop_structure.h
+++ b/av1/encoder/gop_structure.h
@@ -38,7 +38,7 @@ struct EncodeFrameParams;
*
* \param[in] cpi Top - level encoder instance structure
*
- * \return No return value but this function updates group data structures.
+ * \remark No return value but this function updates group data structures.
*/
void av1_gop_setup_structure(struct AV1_COMP *cpi);
@@ -58,7 +58,7 @@ void av1_gop_setup_structure(struct AV1_COMP *cpi);
* uni-directional group.
* \param[in] gf_group_bits Bits available to be allocated.
*
- * \return No return but updates the rate control and group data structures
+ * \remark No return but updates the rate control and group data structures
* to reflect the allocation of bits.
*/
void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index ab8e3c34e..d8639100f 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -228,7 +228,7 @@ static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x,
*/
static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
+ int64_t *distortion, uint8_t *skippable,
BLOCK_SIZE bsize, int mode_cost,
PREDICTION_MODE best_mode_so_far,
int64_t *best_rd, int64_t *best_model_rd,
@@ -812,7 +812,7 @@ static bool should_prune_chroma_smooth_pred_based_on_source_variance(
int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
+ int64_t *distortion, uint8_t *skippable,
BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
@@ -1137,7 +1137,8 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, const int *bmode_costs,
int64_t *best_rd, int *rate,
int *rate_tokenonly, int64_t *distortion,
- int *skippable, MB_MODE_INFO *best_mbmi,
+ uint8_t *skippable,
+ MB_MODE_INFO *best_mbmi,
PICK_MODE_CONTEXT *ctx) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -1179,7 +1180,7 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
* \callergraph
* This function loops through all filter_intra modes to find the best one.
*
- * \return Returns nothing, but updates the mbmi and rd_stats.
+ * \remark Returns nothing, but updates the mbmi and rd_stats.
*/
static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize,
@@ -1431,7 +1432,7 @@ static AOM_INLINE int prune_luma_odd_delta_angles_using_rd_cost(
// Finds the best non-intrabc mode on an intra frame.
int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
+ int64_t *distortion, uint8_t *skippable,
BLOCK_SIZE bsize, int64_t best_rd,
PICK_MODE_CONTEXT *ctx) {
MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index 0a6b76955..75289c4e3 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -62,7 +62,7 @@ typedef struct IntraModeSearchState {
int rate_uv_intra; /*!< \brief Total rate to transmit uv_mode */
int rate_uv_tokenonly; /*!< \brief Rate transmit txfm tokens */
int64_t dist_uvs; /*!< \brief Distortion of the uv_mode's recon */
- int skip_uvs; /*!< \brief Whether the uv txfm is skippable */
+ uint8_t skip_uvs; /*!< \brief Whether the uv txfm is skippable */
UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */
PALETTE_MODE_INFO pmi_uv; /*!< \brief Color map if mode_uv is palette */
int8_t uv_angle_delta; /*!< \brief Angle delta if mode_uv directional */
@@ -196,8 +196,6 @@ int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
* \param[in] this_rd_cost Struct to keep track of palette mode's
* rd_stats.
* \param[in] best_rd Best RD seen for this block so far.
- *
- * \return Returns nothing.
*/
void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, unsigned int ref_frame_cost,
@@ -236,7 +234,7 @@ void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
*/
int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
+ int64_t *distortion, uint8_t *skippable,
BLOCK_SIZE bsize, int64_t best_rd,
PICK_MODE_CONTEXT *ctx);
@@ -271,7 +269,7 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
*/
int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
+ int64_t *distortion, uint8_t *skippable,
BLOCK_SIZE bsize, TX_SIZE max_tx_size);
/*! \brief Return the number of colors in src. Used by palette mode.
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h
index e794caf29..31ffdcff2 100644
--- a/av1/encoder/k_means_template.h
+++ b/av1/encoder/k_means_template.h
@@ -11,6 +11,7 @@
#include <assert.h>
#include <stdint.h>
+#include <stdlib.h>
#include <string.h>
#include "av1/common/blockd.h"
@@ -24,17 +25,27 @@
#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
-static int RENAME(calc_dist)(const int *p1, const int *p2) {
+// Though we want to compute the smallest L2 norm, in 1 dimension it is
+// equivalent to finding the smallest L1 norm and then squaring it.
+// This is preferable for speed, especially on the SIMD side.
+static int RENAME(calc_dist)(const int16_t *p1, const int16_t *p2) {
+#if AV1_K_MEANS_DIM == 1
+ return abs(p1[0] - p2[0]);
+#else
int dist = 0;
for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
const int diff = p1[i] - p2[i];
dist += diff * diff;
}
return dist;
+#endif
}
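
The 1-D shortcut above rests on a simple identity: for scalar data the minimizer of the squared distance and of the absolute distance coincide, and only the final value needs squaring,

\[
  \arg\min_c\,(x-c)^2 = \arg\min_c\,\lvert x-c\rvert,
  \qquad
  \min_c\,(x-c)^2 = \bigl(\min_c\,\lvert x-c\rvert\bigr)^2
\]

which is why av1_calc_indices below accumulates min_dist * min_dist only in the AV1_K_MEANS_DIM == 1 case.
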
-void RENAME(av1_calc_indices)(const int *data, const int *centroids,
- uint8_t *indices, int n, int k) {
+void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *dist, int n, int k) {
+ if (dist) {
+ *dist = 0;
+ }
for (int i = 0; i < n; ++i) {
int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
indices[i] = 0;
@@ -46,23 +57,32 @@ void RENAME(av1_calc_indices)(const int *data, const int *centroids,
indices[i] = j;
}
}
+ if (dist) {
+#if AV1_K_MEANS_DIM == 1
+ *dist += min_dist * min_dist;
+#else
+ *dist += min_dist;
+#endif
+ }
}
}
-static void RENAME(calc_centroids)(const int *data, int *centroids,
+static void RENAME(calc_centroids)(const int16_t *data, int16_t *centroids,
const uint8_t *indices, int n, int k) {
int i, j;
int count[PALETTE_MAX_SIZE] = { 0 };
+ int centroids_sum[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
unsigned int rand_state = (unsigned int)data[0];
assert(n <= 32768);
- memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+ memset(centroids_sum, 0, sizeof(centroids_sum[0]) * k * AV1_K_MEANS_DIM);
for (i = 0; i < n; ++i) {
const int index = indices[i];
assert(index < k);
++count[index];
for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
- centroids[index * AV1_K_MEANS_DIM + j] += data[i * AV1_K_MEANS_DIM + j];
+ centroids_sum[index * AV1_K_MEANS_DIM + j] +=
+ data[i * AV1_K_MEANS_DIM + j];
}
}
@@ -74,61 +94,57 @@ static void RENAME(calc_centroids)(const int *data, int *centroids,
} else {
for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
centroids[i * AV1_K_MEANS_DIM + j] =
- DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]);
+ DIVIDE_AND_ROUND(centroids_sum[i * AV1_K_MEANS_DIM + j], count[i]);
}
}
}
}
-static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
- const uint8_t *indices, int n, int k) {
- int64_t dist = 0;
- (void)k;
- for (int i = 0; i < n; ++i) {
- dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
- centroids + indices[i] * AV1_K_MEANS_DIM);
- }
- return dist;
-}
-
-void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
- int n, int k, int max_itr) {
- int pre_centroids[2 * PALETTE_MAX_SIZE];
- uint8_t pre_indices[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT];
+void RENAME(av1_k_means)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k, int max_itr) {
+ int16_t centroids_tmp[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+ uint8_t indices_tmp[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT];
+ int16_t *meta_centroids[2] = { centroids, centroids_tmp };
+ uint8_t *meta_indices[2] = { indices, indices_tmp };
+ int i, l = 0, prev_l, best_l = 0;
+ int64_t this_dist;
assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT);
-#if AV1_K_MEANS_DIM - 2
- av1_calc_indices_dim1(data, centroids, indices, n, k);
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, centroids, indices, &this_dist, n, k);
#else
- av1_calc_indices_dim2(data, centroids, indices, n, k);
+ av1_calc_indices_dim2(data, centroids, indices, &this_dist, n, k);
#endif
- int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
- for (int i = 0; i < max_itr; ++i) {
- const int64_t pre_dist = this_dist;
- memcpy(pre_centroids, centroids,
- sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
- memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+ for (i = 0; i < max_itr; ++i) {
+ const int64_t prev_dist = this_dist;
+ prev_l = l;
+ l = (l == 1) ? 0 : 1;
- RENAME(calc_centroids)(data, centroids, indices, n, k);
-#if AV1_K_MEANS_DIM - 2
- av1_calc_indices_dim1(data, centroids, indices, n, k);
+ RENAME(calc_centroids)(data, meta_centroids[l], meta_indices[prev_l], n, k);
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
#else
- av1_calc_indices_dim2(data, centroids, indices, n, k);
+ av1_calc_indices_dim2(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
#endif
- this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
- if (this_dist > pre_dist) {
- memcpy(centroids, pre_centroids,
- sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
- memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+ if (this_dist > prev_dist) {
+ best_l = prev_l;
break;
}
- if (!memcmp(centroids, pre_centroids,
- sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM))
+ if (!memcmp(meta_centroids[l], meta_centroids[prev_l],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM))
break;
}
+ if (i == max_itr) best_l = l;
+ if (best_l != 0) {
+ memcpy(centroids, meta_centroids[1],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+ memcpy(indices, meta_indices[1], sizeof(indices[0]) * n);
+ }
}
#undef RENAME_
#undef RENAME
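
The rewritten av1_k_means() ping-pongs between two centroid/index buffers so that the best-scoring pass can be kept without the per-iteration memcpy of the old code. A minimal sketch of the same buffering pattern is given below; the names and callbacks are invented for illustration and are not the library API:

    /* Hypothetical illustration of the double-buffer ("ping-pong") scheme
     * used by av1_k_means() above. Lower cost is better. */
    double cost_of(const double *buf, int n);           /* user-supplied */
    void refine(const double *in, double *out, int n);  /* user-supplied */

    static int pick_best_buffer(double *bufs[2], int n, int max_itr) {
      int l = 0, best_l = 0, i;
      double cost = cost_of(bufs[0], n);
      for (i = 0; i < max_itr; ++i) {
        const double prev_cost = cost;
        const int prev_l = l;
        l = 1 - l;                       /* write the new estimate elsewhere */
        refine(bufs[prev_l], bufs[l], n);
        cost = cost_of(bufs[l], n);
        if (cost > prev_cost) {          /* regressed: keep the previous pass */
          best_l = prev_l;
          break;
        }
      }
      if (i == max_itr) best_l = l;      /* used all iterations: last pass wins */
      return best_l;                     /* caller copies bufs[best_l] if != 0 */
    }
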
diff --git a/av1/encoder/level.c b/av1/encoder/level.c
index eab472865..4d1714278 100644
--- a/av1/encoder/level.c
+++ b/av1/encoder/level.c
@@ -209,10 +209,110 @@ static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = {
.high_cr = 4.0,
.max_tiles = 128,
.max_tile_cols = 16 },
- UNDEFINED_LEVEL,
- UNDEFINED_LEVEL,
- UNDEFINED_LEVEL,
- UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_7_0,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_1,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 8556380160L,
+ .max_decode_rate = 8758886400L,
+ .max_header_rate = 300,
+ .main_mbps = 200.0,
+ .high_mbps = 960.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_2,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 17517772800L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_3,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_8_0,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_1,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 34225520640L,
+ .max_decode_rate = 34910031052L,
+ .max_header_rate = 300,
+ .main_mbps = 400.0,
+ .high_mbps = 1920.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_2,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 69820062105L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_3,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 75296145408L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
};
typedef enum {
@@ -846,7 +946,6 @@ static void get_temporal_parallel_params(int scalability_mode_idc,
}
}
-#define MAX_TILE_SIZE (4096 * 2304)
#define MIN_CROPPED_TILE_WIDTH 8
#define MIN_CROPPED_TILE_HEIGHT 8
#define MIN_FRAME_WIDTH 16
@@ -917,7 +1016,10 @@ static TARGET_LEVEL_FAIL_ID check_level_constraints(
break;
}
- if (level_stats->max_tile_size > MAX_TILE_SIZE) {
+ const int max_tile_size = (level >= SEQ_LEVEL_7_0 && level <= SEQ_LEVEL_8_3)
+ ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE
+ : MAX_TILE_AREA;
+ if (level_stats->max_tile_size > max_tile_size) {
fail_id = TILE_TOO_LARGE;
break;
}
diff --git a/av1/encoder/lookahead.c b/av1/encoder/lookahead.c
index a9bccb1c6..10fbb77cb 100644
--- a/av1/encoder/lookahead.c
+++ b/av1/encoder/lookahead.c
@@ -155,7 +155,10 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
buf->flags = flags;
++ctx->push_frame_count;
aom_remove_metadata_from_frame_buffer(&buf->img);
- aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata);
+ if (src->metadata &&
+ aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata)) {
+ return 1;
+ }
return 0;
}
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 395e35079..f8f8d8d30 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -78,8 +78,9 @@ get_faster_search_method(SEARCH_METHODS search_method) {
case SQUARE: return HEX;
case HEX: return FAST_HEX;
case FAST_HEX: return FAST_HEX;
- case FAST_DIAMOND: return FAST_DIAMOND;
+ case FAST_DIAMOND: return VFAST_DIAMOND;
case FAST_BIGDIA: return FAST_BIGDIA;
+ case VFAST_DIAMOND: return VFAST_DIAMOND;
default: assert(0 && "Invalid search method!"); return DIAMOND;
}
}
@@ -93,7 +94,7 @@ void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
void av1_make_default_fullpel_ms_params(
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
- const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
int fine_search_interval) {
const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
@@ -105,10 +106,24 @@ void av1_make_default_fullpel_ms_params(
init_ms_buffers(&ms_params->ms_buffers, x);
SEARCH_METHODS search_method = mv_sf->search_method;
- if (mv_sf->use_bsize_dependent_search_method) {
- const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
- if (min_dim >= 32) {
- search_method = get_faster_search_method(search_method);
+ const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method;
+ const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ const bool use_faster_search_method =
+ (sf_blk_search_method == 1 && min_dim >= 32) ||
+ (sf_blk_search_method >= 2 && min_dim >= 16 &&
+ x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3);
+
+ if (use_faster_search_method) {
+ search_method = get_faster_search_method(search_method);
+
+ // We might need to update the search site config since search_method
+ // is changed here.
+ const int ref_stride = ms_params->ms_buffers.ref->stride;
+ if (ref_stride != search_sites[search_method].stride) {
+ av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+ ref_stride);
+ search_sites = x->search_site_cfg_buf;
}
}
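
On the new condition above: qband = x->qindex >> (QINDEX_BITS - 2) splits the quantizer range into four bands. Assuming the usual libaom value QINDEX_BITS == 8 (qindex in [0, 255]), qband = qindex >> 6, so qband < 3 is equivalent to qindex < 192; the lowered block-size threshold (16 instead of 32) therefore only applies when the source SAD is at most medium and the quantizer is not in the top quarter of the range.
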
@@ -119,9 +134,12 @@ void av1_make_default_fullpel_ms_params(
if (use_downsampled_sad) {
ms_params->sdf = ms_params->vfp->sdsf;
ms_params->sdx4df = ms_params->vfp->sdsx4df;
+ // A skip version of sadx3 is not available yet
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
} else {
ms_params->sdf = ms_params->vfp->sdf;
ms_params->sdx4df = ms_params->vfp->sdx4df;
+ ms_params->sdx3df = ms_params->vfp->sdx3df;
}
ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
@@ -185,15 +203,6 @@ void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
init_ms_buffers(ms_buffers, x);
}
-static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
- return mv->row * stride + mv->col;
-}
-
-static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
- const FULLPEL_MV *mv) {
- return &buf->buf[get_offset_from_fullmv(mv, buf->stride)];
-}
-
void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
int col_min =
GET_MV_RAWPEL(mv->col) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
@@ -837,11 +846,13 @@ static AOM_FORCE_INLINE void calc_int_sad_list(
// If the current sad is lower than the current best sad.
// Returns:
// Whether the input sad (mv) is better than the current best.
-static int update_mvs_and_sad(const unsigned int this_sad, const FULLPEL_MV *mv,
- const MV_COST_PARAMS *mv_cost_params,
- unsigned int *best_sad,
- unsigned int *raw_best_sad, FULLPEL_MV *best_mv,
- FULLPEL_MV *second_best_mv) {
+static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad,
+ const FULLPEL_MV *mv,
+ const MV_COST_PARAMS *mv_cost_params,
+ unsigned int *best_sad,
+ unsigned int *raw_best_sad,
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV *second_best_mv) {
if (this_sad >= *best_sad) return 0;
// Add the motion vector cost.
@@ -858,33 +869,36 @@ static int update_mvs_and_sad(const unsigned int this_sad, const FULLPEL_MV *mv,
// Calculate sad4 and update the bestmv information
// in FAST_DIAMOND search method.
-static void calc_sad4_update_bestmv(
+static AOM_INLINE void calc_sad4_update_bestmv(
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
- FULLPEL_MV *temp_best_mv, unsigned int *bestsad, unsigned int *raw_bestsad,
- int search_step, int *best_site, int cand_start) {
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, int cand_start, int *cost_list) {
const struct buf_2d *const src = ms_params->ms_buffers.src;
const struct buf_2d *const ref = ms_params->ms_buffers.ref;
const search_site *site = ms_params->search_sites->site[search_step];
unsigned char const *block_offset[4];
- unsigned int sads[4];
- const uint8_t *best_address;
+ unsigned int sads_buf[4];
+ unsigned int *sads;
const uint8_t *src_buf = src->buf;
const int src_stride = src->stride;
- best_address = get_buf_from_fullmv(ref, temp_best_mv);
+ if (cost_list) {
+ sads = (unsigned int *)(cost_list + 1);
+ } else {
+ sads = sads_buf;
+ }
// Loop over number of candidates.
for (int j = 0; j < 4; j++)
- block_offset[j] = site[cand_start + j].offset + best_address;
+ block_offset[j] = site[cand_start + j].offset + center_address;
// 4-point sad calculation.
ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
for (int j = 0; j < 4; j++) {
- const FULLPEL_MV this_mv = {
- temp_best_mv->row + site[cand_start + j].mv.row,
- temp_best_mv->col + site[cand_start + j].mv.col
- };
+ const FULLPEL_MV this_mv = { center_mv.row + site[cand_start + j].mv.row,
+ center_mv.col + site[cand_start + j].mv.col };
const int found_better_mv = update_mvs_and_sad(
sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
/*second_best_mv=*/NULL);
@@ -892,23 +906,94 @@ static void calc_sad4_update_bestmv(
}
}
+static AOM_INLINE void calc_sad3_update_bestmv(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad,
+ unsigned int *raw_bestsad, int search_step, int *best_site,
+ const int *chkpts_indices, int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ unsigned char const *block_offset[4] = {
+ center_address + site[chkpts_indices[0]].offset,
+ center_address + site[chkpts_indices[1]].offset,
+ center_address + site[chkpts_indices[2]].offset,
+ center_address,
+ };
+ unsigned int sads[4];
+ ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads);
+ for (int j = 0; j < 3; j++) {
+ const int index = chkpts_indices[j];
+ const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
+ center_mv.col + site[index].mv.col };
+ const int found_better_mv = update_mvs_and_sad(
+ sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = j;
+ }
+ if (cost_list) {
+ for (int j = 0; j < 3; j++) {
+ int index = chkpts_indices[j];
+ cost_list[index + 1] = sads[j];
+ }
+ }
+}
+
// Calculate sad and update the bestmv information
// in FAST_DIAMOND search method.
-static void calc_sad_update_bestmv(
+static AOM_INLINE void calc_sad_update_bestmv(
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
- FULLPEL_MV *temp_best_mv, unsigned int *bestsad, unsigned int *raw_bestsad,
- int search_step, int *best_site, const int num_candidates, int cand_start) {
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, const int num_candidates, int cand_start, int *cost_list) {
const struct buf_2d *const src = ms_params->ms_buffers.src;
const struct buf_2d *const ref = ms_params->ms_buffers.ref;
const search_site *site = ms_params->search_sites->site[search_step];
// Loop over number of candidates.
for (int i = cand_start; i < num_candidates; i++) {
- const FULLPEL_MV this_mv = { temp_best_mv->row + site[i].mv.row,
- temp_best_mv->col + site[i].mv.col };
+ const FULLPEL_MV this_mv = { center_mv.row + site[i].mv.row,
+ center_mv.col + site[i].mv.col };
if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
- int thissad = get_mvpred_sad(
- ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref->stride);
+ int thissad = get_mvpred_sad(ms_params, src,
+ center_address + site[i].offset, ref->stride);
+ if (cost_list) {
+ cost_list[i + 1] = thissad;
+ }
+ const int found_better_mv = update_mvs_and_sad(
+ thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = i;
+ }
+}
+
+static AOM_INLINE void calc_sad_update_bestmv_with_indices(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, const int num_candidates, const int *chkpts_indices,
+ int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ // Loop over number of candidates.
+ for (int i = 0; i < num_candidates; i++) {
+ int index = chkpts_indices[i];
+ const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
+ center_mv.col + site[index].mv.col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ if (cost_list) {
+ cost_list[index + 1] = INT_MAX;
+ }
+ continue;
+ }
+ const int thissad = get_mvpred_sad(
+ ms_params, src, center_address + site[index].offset, ref->stride);
+ if (cost_list) {
+ cost_list[index + 1] = thissad;
+ }
const int found_better_mv = update_mvs_and_sad(
thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
/*second_best_mv=*/NULL);
@@ -937,7 +1022,6 @@ static int pattern_search(FULLPEL_MV start_mv,
const int last_is_4 = num_candidates[0] == 4;
int br, bc;
unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX;
- int thissad;
int k = -1;
const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1);
@@ -961,32 +1045,31 @@ static int pattern_search(FULLPEL_MV start_mv,
// Search all possible scales up to the search param around the center point
// pick the scale of the point that is best as the starting scale of
// further steps around it.
+ const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv);
if (do_init_search) {
s = best_init_s;
best_init_s = -1;
for (t = 0; t <= s; ++t) {
int best_site = -1;
- FULLPEL_MV temp_best_mv;
- temp_best_mv.row = br;
- temp_best_mv.col = bc;
+ FULLPEL_MV center_mv = { br, bc };
if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
// Call 4-point sad for multiples of 4 candidates.
const int no_of_4_cand_loops = num_candidates[t] >> 2;
for (i = 0; i < no_of_4_cand_loops; i++) {
- calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
- &temp_best_mv, &bestsad, &raw_bestsad, t,
- &best_site, i * 4);
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, i * 4, /*cost_list=*/NULL);
}
// Rest of the candidates
const int remaining_cand = num_candidates[t] % 4;
- calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
- &temp_best_mv, &bestsad, &raw_bestsad, t,
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
&best_site, remaining_cand,
- no_of_4_cand_loops * 4);
+ no_of_4_cand_loops * 4, NULL);
} else {
- calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
- &temp_best_mv, &bestsad, &raw_bestsad, t,
- &best_site, num_candidates[t], 0);
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, num_candidates[t], 0, NULL);
}
if (best_site == -1) {
continue;
@@ -998,6 +1081,7 @@ static int pattern_search(FULLPEL_MV start_mv,
if (best_init_s != -1) {
br += search_sites->site[best_init_s][k].mv.row;
bc += search_sites->site[best_init_s][k].mv.col;
+ center_address += search_sites->site[best_init_s][k].offset;
}
}
@@ -1011,27 +1095,26 @@ static int pattern_search(FULLPEL_MV start_mv,
for (; s >= last_s; s--) {
// No need to search all points the 1st time if initial search was used
if (!do_init_search || s != best_init_s) {
- FULLPEL_MV temp_best_mv;
- temp_best_mv.row = br;
- temp_best_mv.col = bc;
+ FULLPEL_MV center_mv = { br, bc };
if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
// Call 4-point sad for multiples of 4 candidates.
const int no_of_4_cand_loops = num_candidates[s] >> 2;
for (i = 0; i < no_of_4_cand_loops; i++) {
calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
- &temp_best_mv, &bestsad, &raw_bestsad, s,
- &best_site, i * 4);
+ center_mv, center_address, &bestsad,
+ &raw_bestsad, s, &best_site, i * 4,
+ /*cost_list=*/NULL);
}
// Rest of the candidates
const int remaining_cand = num_candidates[s] % 4;
- calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
- &temp_best_mv, &bestsad, &raw_bestsad, s,
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
&best_site, remaining_cand,
- no_of_4_cand_loops * 4);
+ no_of_4_cand_loops * 4, NULL);
} else {
- calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
- &temp_best_mv, &bestsad, &raw_bestsad, s,
- &best_site, num_candidates[s], 0);
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, num_candidates[s], 0, NULL);
}
if (best_site == -1) {
@@ -1039,6 +1122,7 @@ static int pattern_search(FULLPEL_MV start_mv,
} else {
br += search_sites->site[s][best_site].mv.row;
bc += search_sites->site[s][best_site].mv.col;
+ center_address += search_sites->site[s][best_site].offset;
k = best_site;
}
}
@@ -1050,82 +1134,48 @@ static int pattern_search(FULLPEL_MV start_mv,
next_chkpts_indices[1] = k;
next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ FULLPEL_MV center_mv = { br, bc };
if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
- for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- const FULLPEL_MV this_mv = {
- br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
- bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
- };
- thissad = get_mvpred_sad(
- ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
- const int found_better_mv =
- update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
- &raw_bestsad, best_mv,
- /*second_best_mv=*/NULL);
- if (found_better_mv) best_site = i;
- }
+ calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, next_chkpts_indices, NULL);
} else {
- for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- const FULLPEL_MV this_mv = {
- br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
- bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
- };
- if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
- continue;
- thissad = get_mvpred_sad(
- ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
- const int found_better_mv =
- update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
- &raw_bestsad, best_mv,
- /*second_best_mv=*/NULL);
- if (found_better_mv) best_site = i;
- }
+ calc_sad_update_bestmv_with_indices(
+ ms_params, mv_cost_params, best_mv, center_mv, center_address,
+ &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+ next_chkpts_indices, NULL);
}
if (best_site != -1) {
k = next_chkpts_indices[best_site];
br += search_sites->site[s][k].mv.row;
bc += search_sites->site[s][k].mv.col;
+ center_address += search_sites->site[s][k].offset;
}
} while (best_site != -1);
}
-
// Note: If we enter the if below, then cost_list must be non-NULL.
if (s == 0) {
cost_list[0] = raw_bestsad;
costlist_has_sad = 1;
+ assert(num_candidates[s] == 4);
if (!do_init_search || s != best_init_s) {
+ FULLPEL_MV center_mv = { br, bc };
if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
- for (i = 0; i < num_candidates[s]; i++) {
- const FULLPEL_MV this_mv = { br + search_sites->site[s][i].mv.row,
- bc + search_sites->site[s][i].mv.col };
- cost_list[i + 1] = thissad = get_mvpred_sad(
- ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
- const int found_better_mv =
- update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
- &raw_bestsad, best_mv,
- /*second_best_mv=*/NULL);
- if (found_better_mv) best_site = i;
- }
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, 0, cost_list);
} else {
- for (i = 0; i < num_candidates[s]; i++) {
- const FULLPEL_MV this_mv = { br + search_sites->site[s][i].mv.row,
- bc + search_sites->site[s][i].mv.col };
- if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
- continue;
- cost_list[i + 1] = thissad = get_mvpred_sad(
- ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
- const int found_better_mv =
- update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
- &raw_bestsad, best_mv,
- /*second_best_mv=*/NULL);
- if (found_better_mv) best_site = i;
- }
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, /*num_candidates=*/4,
+ /*cand_start=*/0, cost_list);
}
if (best_site != -1) {
br += search_sites->site[s][best_site].mv.row;
bc += search_sites->site[s][best_site].mv.col;
+ center_address += search_sites->site[s][best_site].offset;
k = best_site;
}
}
@@ -1139,52 +1189,34 @@ static int pattern_search(FULLPEL_MV start_mv,
cost_list[((k + 2) % 4) + 1] = cost_list[0];
cost_list[0] = raw_bestsad;
+ FULLPEL_MV center_mv = { br, bc };
if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
- for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- const FULLPEL_MV this_mv = {
- br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
- bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
- };
- cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
- ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
- const int found_better_mv =
- update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
- &raw_bestsad, best_mv,
- /*second_best_mv=*/NULL);
- if (found_better_mv) best_site = i;
- }
+ assert(PATTERN_CANDIDATES_REF == 3);
+ calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, next_chkpts_indices, cost_list);
} else {
- for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- const FULLPEL_MV this_mv = {
- br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
- bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
- };
- if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
- cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
- continue;
- }
- cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
- ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
- const int found_better_mv =
- update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
- &raw_bestsad, best_mv,
- /*second_best_mv=*/NULL);
- if (found_better_mv) best_site = i;
- }
+ calc_sad_update_bestmv_with_indices(
+ ms_params, mv_cost_params, best_mv, center_mv, center_address,
+ &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+ next_chkpts_indices, cost_list);
}
if (best_site != -1) {
k = next_chkpts_indices[best_site];
br += search_sites->site[s][k].mv.row;
bc += search_sites->site[s][k].mv.col;
+ center_address += search_sites->site[s][k].offset;
}
}
}
}
-
best_mv->row = br;
best_mv->col = bc;
+ assert(center_address == get_buf_from_fullmv(ref, best_mv) &&
+ "center address is out of sync with best_mv!\n");
+
// Returns the one-away integer pel cost/sad around the best as follows:
// cost_list[0]: cost/sad at the best integer pel
// cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
@@ -1198,8 +1230,6 @@ static int pattern_search(FULLPEL_MV start_mv,
calc_int_cost_list(*best_mv, ms_params, cost_list);
}
}
- best_mv->row = br;
- best_mv->col = bc;
const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
return var_cost;
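
Design note on the pattern_search() refactor above: instead of recomputing get_buf_from_fullmv(ref, &mv) for every probe, the search now keeps center_address in lock-step with the current centre (br, bc), advancing it by the precomputed site offset whenever the centre moves; the new assert checks exactly this invariant. In effect the code relies on

    center_address == &ref->buf[br * ref->stride + bc]
    site->offset   == site->mv.row * ref->stride + site->mv.col

where the second relation is assumed to hold for the current reference stride, which is why the search site config is refreshed when the stride changes.
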
@@ -1249,6 +1279,15 @@ static int fast_hex_search(const FULLPEL_MV start_mv,
cost_list, best_mv);
}
+static int vfast_dia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv) {
+ return bigdia_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step),
+ do_init_search, cost_list, best_mv);
+}
+
static int fast_dia_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
@@ -1692,6 +1731,10 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
best_mv);
break;
+ case VFAST_DIAMOND:
+ var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv);
+ break;
case FAST_DIAMOND:
var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
best_mv);
@@ -1769,6 +1812,7 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
new_ms_params.sdf = new_ms_params.vfp->sdf;
new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+ new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
cost_list, best_mv, second_best_mv);
@@ -1946,7 +1990,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
uint8_t const *ref_buf, *src_buf;
int_mv *best_int_mv = &xd->mi[0]->mv[0];
unsigned int best_sad, tmp_sad, this_sad[4];
- const int norm_factor = 3 + (bw >> 5);
+ const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+ const int col_norm_factor = 3 + (bw >> 5);
const YV12_BUFFER_CONFIG *scaled_ref_frame =
av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
static const MV search_pos[4] = {
@@ -1981,28 +2026,16 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
// Set up prediction 1-D reference set
ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
- for (idx = 0; idx < search_width; idx += 16) {
- aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
- ref_buf += 16;
- }
+ aom_int_pro_row(hbuf, ref_buf, ref_stride, search_width, bh, row_norm_factor);
ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
- for (idx = 0; idx < search_height; ++idx) {
- vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
- ref_buf += ref_stride;
- }
+ aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, search_height,
+ col_norm_factor);
// Set up src 1-D reference set
- for (idx = 0; idx < bw; idx += 16) {
- src_buf = x->plane[0].src.buf + idx;
- aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
- }
-
src_buf = x->plane[0].src.buf;
- for (idx = 0; idx < bh; ++idx) {
- src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
- src_buf += src_stride;
- }
+ aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+ aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
// Find the best match per 1-D search
best_int_mv->as_fullmv.col =
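The rewritten setup above replaces the per-16-column and per-row loops with single aom_int_pro_row()/aom_int_pro_col() calls that take the region size and an explicit normalization shift (row_norm_factor, col_norm_factor). As the removed loop shows, each output of the column projection is simply a row sum shifted right by the norm factor. The scalar sketch below illustrates that behaviour; the function name and the test values are hypothetical stand-ins, not the optimized libaom kernels.

#include <stdint.h>

/* Hypothetical scalar sketch of a 1-D column projection: one value per row,
 * equal to the row's pixel sum shifted right by norm_factor so it fits
 * comfortably in an int16_t. The SIMD aom_int_pro_col() used above follows
 * the same idea. */
static void int_pro_col_sketch(int16_t *vbuf, const uint8_t *ref,
                               int ref_stride, int width, int height,
                               int norm_factor) {
  for (int row = 0; row < height; ++row) {
    int sum = 0;
    for (int col = 0; col < width; ++col) sum += ref[row * ref_stride + col];
    vbuf[row] = (int16_t)(sum >> norm_factor);
  }
}

int main(void) {
  static const uint8_t ref[8 * 16] = { 0 };  /* all-zero 8x16 test region */
  int16_t vbuf[8];
  int_pro_col_sketch(vbuf, ref, /*ref_stride=*/16, /*width=*/16, /*height=*/8,
                     /*norm_factor=*/4);
  return vbuf[0];  /* 0 for the all-zero input */
}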
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 3917d8435..1e8bbab4c 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -137,13 +137,14 @@ typedef struct {
// sdf in vfp (e.g. downsampled sad and not sad) to allow speed up.
aom_sad_fn_t sdf;
aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
} FULLPEL_MOTION_SEARCH_PARAMS;
void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
void av1_make_default_fullpel_ms_params(
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
- const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
int fine_search_interval);
@@ -196,9 +197,21 @@ static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = {
SQUARE, // SQUARE
HEX, // FAST_HEX
BIGDIA, // FAST_DIAMOND
- BIGDIA // FAST_BIGDIA
+ BIGDIA, // FAST_BIGDIA
+ BIGDIA // VFAST_DIAMOND
};
+// Reinitialize the search site config.
+static AOM_INLINE void av1_refresh_search_site_config(
+ search_site_config *ss_cfg_buf, SEARCH_METHODS search_method,
+ const int ref_stride) {
+ const int level =
+ search_method == NSTEP_8PT || search_method == CLAMPED_DIAMOND;
+ search_method = search_method_lookup[search_method];
+ av1_init_motion_compensation[search_method](&ss_cfg_buf[search_method],
+ ref_stride, level);
+}
+
// MVs beyond the range do not produce a new/different prediction block.
static INLINE void av1_set_mv_search_method(
FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
@@ -364,6 +377,15 @@ static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits,
(mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
}
+static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
+ return mv->row * stride + mv->col;
+}
+
+static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
+ const FULLPEL_MV *mv) {
+ return &buf->buf[get_offset_from_fullmv(mv, buf->stride)];
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
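The two helpers added above reduce full-pel MV addressing to one row-major offset; this is what lets pattern_search() keep center_address in sync with (br, bc) by adding precomputed site offsets instead of recomputing the pointer. A minimal, self-contained sketch of the arithmetic, using hypothetical stand-in types rather than the libaom structs:

#include <assert.h>
#include <stdint.h>

typedef struct { int row, col; } FullMvSketch;                  /* stand-in for FULLPEL_MV */
typedef struct { const uint8_t *buf; int stride; } Buf2dSketch; /* stand-in for buf_2d */

/* Row-major addressing: one row down advances by `stride` samples, one column
 * right advances by one sample. */
static int offset_from_fullmv_sketch(const FullMvSketch *mv, int stride) {
  return mv->row * stride + mv->col;
}

int main(void) {
  static uint8_t frame[64 * 64] = { 0 };
  const Buf2dSketch ref = { frame, 64 };
  const FullMvSketch mv = { 3, 5 };  /* 3 rows down, 5 columns right */
  const uint8_t *p = &ref.buf[offset_from_fullmv_sketch(&mv, ref.stride)];
  assert(p == frame + 3 * 64 + 5);   /* same pointer the real helpers compute */
  return 0;
}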
diff --git a/av1/encoder/mcomp_structs.h b/av1/encoder/mcomp_structs.h
index 322021841..3fc1ab868 100644
--- a/av1/encoder/mcomp_structs.h
+++ b/av1/encoder/mcomp_structs.h
@@ -74,6 +74,8 @@ enum {
FAST_DIAMOND = 8,
// BIGDIA search with up to 3 stages.
FAST_BIGDIA = 9,
+ // BIGDIA search with up to 1 stage.
+ VFAST_DIAMOND = 10,
// Total number of search methods.
NUM_SEARCH_METHODS,
// Number of distinct search methods.
diff --git a/av1/encoder/mips/msa/error_msa.c b/av1/encoder/mips/msa/error_msa.c
deleted file mode 100644
index 2e86dee43..000000000
--- a/av1/encoder/mips/msa/error_msa.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
- static int64_t block_error_##BSize##size_msa( \
- const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \
- int64_t err = 0; \
- uint32_t loop_cnt; \
- v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
- v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
- v2i64 sq_coeff_r, sq_coeff_l; \
- v2i64 err0, err_dup0, err1, err_dup1; \
- \
- coeff = LD_SH(coeff_ptr); \
- dq_coeff = LD_SH(dq_coeff_ptr); \
- UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
- ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
- HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
- DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \
- sq_coeff_l); \
- DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
- \
- coeff = LD_SH(coeff_ptr + 8); \
- dq_coeff = LD_SH(dq_coeff_ptr + 8); \
- UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
- ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
- HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
- DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
- DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
- \
- coeff_ptr += 16; \
- dq_coeff_ptr += 16; \
- \
- for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \
- coeff = LD_SH(coeff_ptr); \
- dq_coeff = LD_SH(dq_coeff_ptr); \
- UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
- ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
- HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
- DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
- DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
- \
- coeff = LD_SH(coeff_ptr + 8); \
- dq_coeff = LD_SH(dq_coeff_ptr + 8); \
- UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
- ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
- HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
- DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
- DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
- \
- coeff_ptr += 16; \
- dq_coeff_ptr += 16; \
- } \
- \
- err_dup0 = __msa_splati_d(sq_coeff_r, 1); \
- err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
- sq_coeff_r += err_dup0; \
- sq_coeff_l += err_dup1; \
- *ssz = __msa_copy_s_d(sq_coeff_r, 0); \
- *ssz += __msa_copy_s_d(sq_coeff_l, 0); \
- \
- err_dup0 = __msa_splati_d(err0, 1); \
- err_dup1 = __msa_splati_d(err1, 1); \
- err0 += err_dup0; \
- err1 += err_dup1; \
- err = __msa_copy_s_d(err0, 0); \
- err += __msa_copy_s_d(err1, 0); \
- \
- return err; \
- }
-
-/* clang-format off */
-BLOCK_ERROR_BLOCKSIZE_MSA(16)
-BLOCK_ERROR_BLOCKSIZE_MSA(64)
-BLOCK_ERROR_BLOCKSIZE_MSA(256)
-BLOCK_ERROR_BLOCKSIZE_MSA(1024)
-/* clang-format on */
-
-int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,
- const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
- int64_t *ssz) {
- int64_t err;
- const int16_t *coeff = (const int16_t *)coeff_ptr;
- const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
-
- switch (blk_size) {
- case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
- case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
- case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
- case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
- default:
- err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
- break;
- }
-
- return err;
-}
diff --git a/av1/encoder/mips/msa/fdct4x4_msa.c b/av1/encoder/mips/msa/fdct4x4_msa.c
deleted file mode 100644
index 085c08bfb..000000000
--- a/av1/encoder/mips/msa/fdct4x4_msa.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/enums.h"
-
-void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
- int32_t src_stride) {
- v8i16 in0, in1, in2, in3, in4;
-
- LD_SH4(input, src_stride, in0, in1, in2, in3);
-
- in0 += in1;
- in3 -= in2;
- in4 = (in0 - in3) >> 1;
- SUB2(in4, in1, in4, in2, in1, in2);
- in0 -= in2;
- in3 += in1;
-
- TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
-
- in0 += in2;
- in1 -= in3;
- in4 = (in0 - in1) >> 1;
- SUB2(in4, in2, in4, in3, in2, in3);
- in0 -= in3;
- in1 += in2;
-
- SLLI_4V(in0, in1, in2, in3, 2);
-
- TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
-
- ST4x2_UB(in0, output, 4);
- ST4x2_UB(in3, output + 4, 4);
- ST4x2_UB(in1, output + 8, 4);
- ST4x2_UB(in2, output + 12, 4);
-}
diff --git a/av1/encoder/mips/msa/temporal_filter_msa.c b/av1/encoder/mips/msa/temporal_filter_msa.c
deleted file mode 100644
index effa75b83..000000000
--- a/av1/encoder/mips/msa/temporal_filter_msa.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
- uint8_t *frm2_ptr, int32_t filt_sth,
- int32_t filt_wgt, uint32_t *acc,
- uint16_t *cnt) {
- uint32_t row;
- uint64_t f0, f1, f2, f3;
- v16i8 frm2, frm1 = { 0 };
- v16i8 frm4, frm3 = { 0 };
- v16u8 frm_r, frm_l;
- v8i16 frm2_r, frm2_l;
- v8i16 diff0, diff1, mod0_h, mod1_h;
- v4i32 cnst3, cnst16, filt_wt, strength;
- v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
- v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
- v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
- v4i32 acc0, acc1, acc2, acc3;
- v8i16 cnt0, cnt1;
-
- filt_wt = __msa_fill_w(filt_wgt);
- strength = __msa_fill_w(filt_sth);
- cnst3 = __msa_ldi_w(3);
- cnst16 = __msa_ldi_w(16);
-
- for (row = 2; row--;) {
- LD4(frm1_ptr, stride, f0, f1, f2, f3);
- frm1_ptr += (4 * stride);
-
- LD_SB2(frm2_ptr, 16, frm2, frm4);
- frm2_ptr += 32;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- INSERT_D2_SB(f0, f1, frm1);
- INSERT_D2_SB(f2, f3, frm3);
- ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- UNPCK_UB_SH(frm2, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
- UNPCK_UB_SH(frm4, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
- }
-}
-
-static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
- uint8_t *frm2_ptr,
- int32_t filt_sth, int32_t filt_wgt,
- uint32_t *acc, uint16_t *cnt) {
- uint32_t row;
- v16i8 frm1, frm2, frm3, frm4;
- v16u8 frm_r, frm_l;
- v16i8 zero = { 0 };
- v8u16 frm2_r, frm2_l;
- v8i16 diff0, diff1, mod0_h, mod1_h;
- v4i32 cnst3, cnst16, filt_wt, strength;
- v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
- v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
- v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
- v4i32 acc0, acc1, acc2, acc3;
- v8i16 cnt0, cnt1;
-
- filt_wt = __msa_fill_w(filt_wgt);
- strength = __msa_fill_w(filt_sth);
- cnst3 = __msa_ldi_w(3);
- cnst16 = __msa_ldi_w(16);
-
- for (row = 8; row--;) {
- LD_SB2(frm1_ptr, stride, frm1, frm3);
- frm1_ptr += stride;
-
- LD_SB2(frm2_ptr, 16, frm2, frm4);
- frm2_ptr += 16;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- frm1_ptr += stride;
- frm2_ptr += 16;
- }
-}
-
-// TODO(yunqing) The following optimization is not used since c code changes.
-void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
- uint8_t *frame2_ptr, uint32_t blk_w,
- uint32_t blk_h, int32_t strength,
- int32_t filt_wgt, uint32_t *accu,
- uint16_t *cnt) {
- if (8 == (blk_w * blk_h)) {
- temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
- filt_wgt, accu, cnt);
- } else if (16 == (blk_w * blk_h)) {
- temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
- filt_wgt, accu, cnt);
- } else {
- av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
- strength, filt_wgt, accu, cnt);
- }
-}
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index db5ede494..f7e8b96b5 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -35,13 +35,11 @@ extern "C" {
#define MODELRD_TYPE_INTRA 1
#define MODELRD_TYPE_MOTION_MODE_RD 1
-typedef void (*model_rd_for_sb_type)(const AV1_COMP *const cpi,
- BLOCK_SIZE bsize, MACROBLOCK *x,
- MACROBLOCKD *xd, int plane_from,
- int plane_to, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb, int *plane_rate,
- int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist);
typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
const MACROBLOCK *const x,
BLOCK_SIZE plane_bsize, int plane,
@@ -160,7 +158,7 @@ static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
static AOM_INLINE void model_rd_for_sb(
const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
- int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
int64_t *plane_sse, int64_t *plane_dist) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
@@ -212,7 +210,7 @@ static AOM_INLINE void model_rd_for_sb(
static AOM_INLINE void model_rd_for_sb_with_curvfit(
const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
- int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
int64_t *plane_sse, int64_t *plane_dist) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 2a2ad2e27..81232ef9d 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -221,9 +221,8 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
// MotionVectorSearchParams::search_site_cfg. When this happens, we need to
// readjust the stride.
const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
- const int ref_stride = xd->plane[0].pre[0].stride;
- const search_site_config *src_search_site_cfg = av1_get_search_site_config(
- x->search_site_cfg_buf, mv_search_params, search_method, ref_stride);
+ const search_site_config *src_search_site_cfg =
+ av1_get_search_site_config(cpi, x, search_method);
// Further reduce the search range.
if (search_range < INT_MAX) {
@@ -597,10 +596,8 @@ int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
// Make motion search params
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
- const int ref_stride = xd->plane[0].pre[0].stride;
- const search_site_config *src_search_sites = av1_get_search_site_config(
- x->search_site_cfg_buf, &cpi->mv_search_params, search_method,
- ref_stride);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
&ref_mv[id].as_mv, src_search_sites,
/*fine_search_interval=*/0);
@@ -748,10 +745,8 @@ int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
// Make motion search params
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
- const int ref_stride = xd->plane[0].pre[0].stride;
const search_site_config *src_search_sites =
- av1_get_search_site_config(x->search_site_cfg_buf, &cpi->mv_search_params,
- search_method, ref_stride);
+ av1_get_search_site_config(cpi, x, search_method);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
&ref_mv.as_mv, src_search_sites,
/*fine_search_interval=*/0);
@@ -970,10 +965,8 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
const int fine_search_interval = use_fine_search_interval(cpi);
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
- const int ref_stride = xd->plane[0].pre[0].stride;
const search_site_config *src_search_sites =
- av1_get_search_site_config(x->search_site_cfg_buf, &cpi->mv_search_params,
- search_method, ref_stride);
+ av1_get_search_site_config(cpi, x, search_method);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
src_search_sites, fine_search_interval);
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index bc69a6574..4d762876c 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -71,9 +71,13 @@ int_mv av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
unsigned int *sse, unsigned int *var);
static AOM_INLINE const search_site_config *av1_get_search_site_config(
- search_site_config *ss_cfg_buf,
- const MotionVectorSearchParams *mv_search_params,
- SEARCH_METHODS search_method, const int ref_stride) {
+ const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) {
+ const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+ // AV1_COMP::mv_search_params.search_site_config is a compressor level cache
+ // that's shared by multiple threads. In most cases where all frames have the
+ // same resolution, the cache contains the search site config that we need.
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_SRC]->stride) {
return mv_search_params->search_site_cfg[SS_CFG_SRC];
} else if (ref_stride ==
@@ -81,15 +85,18 @@ static AOM_INLINE const search_site_config *av1_get_search_site_config(
return mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD];
}
- if (ref_stride != ss_cfg_buf[search_method].stride) {
- const int level =
- search_method == NSTEP_8PT || search_method == CLAMPED_DIAMOND;
- search_method = search_method_lookup[search_method];
- av1_init_motion_compensation[search_method](&ss_cfg_buf[search_method],
- ref_stride, level);
+ // If the cache does not contain the correct stride, then we will need to rely
+ // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the
+ // thread level config doesn't match, then we need to update it.
+ search_method = search_method_lookup[search_method];
+ assert(search_method_lookup[search_method] == search_method &&
+ "The search_method_lookup table should be idempotent.");
+ if (ref_stride != x->search_site_cfg_buf[search_method].stride) {
+ av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+ ref_stride);
}
- return ss_cfg_buf;
+ return x->search_site_cfg_buf;
}
#ifdef __cplusplus
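The refactored av1_get_search_site_config() above resolves the config in three steps: the two shared compressor-level caches first (usual frame stride, then lookahead stride), and only on a miss the per-thread MACROBLOCK buffer, which is rebuilt when its stride no longer matches. A hedged sketch of that fallback order with simplified stand-in types (the real config also stores the per-site offsets, elided here):

typedef struct { int stride; /* per-site offsets elided */ } SiteCfgSketch;

typedef struct {
  SiteCfgSketch src;        /* compressor-level cache, typical frame stride */
  SiteCfgSketch lookahead;  /* compressor-level cache, lookahead stride */
} SharedCachesSketch;

static const SiteCfgSketch *get_site_cfg_sketch(
    const SharedCachesSketch *shared, SiteCfgSketch *thread_cfg,
    int ref_stride) {
  /* 1) Read-only shared caches: safe to use from many threads without locks. */
  if (ref_stride == shared->src.stride) return &shared->src;
  if (ref_stride == shared->lookahead.stride) return &shared->lookahead;
  /* 2) Per-thread fallback: pay the (re)initialization cost only when the
   *    stride actually changes, standing in for
   *    av1_refresh_search_site_config(). */
  if (ref_stride != thread_cfg->stride) thread_cfg->stride = ref_stride;
  return thread_cfg;
}

int main(void) {
  SharedCachesSketch shared = { { 1280 }, { 640 } };
  SiteCfgSketch thread_cfg = { 0 };
  /* A mismatching stride falls through to the per-thread buffer. */
  return get_site_cfg_sketch(&shared, &thread_cfg, 1920) == &thread_cfg ? 0 : 1;
}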
diff --git a/av1/encoder/nonrd_opt.h b/av1/encoder/nonrd_opt.h
index 39049e592..0d0db81d2 100644
--- a/av1/encoder/nonrd_opt.h
+++ b/av1/encoder/nonrd_opt.h
@@ -14,6 +14,37 @@
#include "av1/encoder/rdopt_utils.h"
+#define RTC_INTER_MODES (4)
+#define RTC_INTRA_MODES (4)
+#define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES))
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+ SMOOTH_PRED };
+
+static const PREDICTION_MODE inter_mode_list[] = { NEARESTMV, NEARMV, GLOBALMV,
+ NEWMV };
+
+static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = {
+ { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
+ { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
+ { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
+ { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
+ { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG },
+ { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB },
+ { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 },
+ { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA },
+};
+
+// Indicates the blocks for which the RD model should be based on special logic
+static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+ BLOCK_SIZE bsize) {
+ const int large_block = bsize >= BLOCK_32X32;
+ const AV1_COMMON *const cm = &cpi->common;
+ return cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+ cm->quant_params.base_qindex &&
+ cm->seq_params->bit_depth == AOM_BITS_8;
+}
/*!\brief Finds predicted motion vectors for a block.
*
* \ingroup nonrd_mode_search
@@ -37,7 +68,7 @@
* prune for low temporal variance block
* \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred
*
- * \return Nothing is returned. Instead, predicted MVs are placed into
+ * \remark Nothing is returned. Instead, predicted MVs are placed into
* \c frame_mv array
*/
static INLINE void find_predictors(
@@ -80,7 +111,9 @@ static INLINE void find_predictors(
bsize);
}
}
- av1_count_overlappable_neighbors(cm, xd);
+ if (cm->features.switchable_motion_mode) {
+ av1_count_overlappable_neighbors(cm, xd);
+ }
mbmi->num_proj_ref = 1;
}
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 58fc07a47..9c14bb17e 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -38,6 +38,7 @@
#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/var_based_part.h"
+#define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3)
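The new CALC_BIASED_RDCOST macro applies a fixed 7/8 bias to an RD cost using integer arithmetic only, e.g. (7 * 1000) >> 3 == 875. A tiny hedged check (the macro body is copied from the line above; the harness around it is illustrative):

#include <assert.h>

#define CALC_BIASED_RDCOST_SKETCH(rdcost) (7 * (rdcost) >> 3)  /* same form as above */

int main(void) {
  /* 7/8 scaling with truncation for non-negative costs. */
  assert(CALC_BIASED_RDCOST_SKETCH(1000) == 875);
  assert(CALC_BIASED_RDCOST_SKETCH(8) == 7);
  assert(CALC_BIASED_RDCOST_SKETCH(0) == 0);
  return 0;
}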
extern int g_pick_inter_mode_cnt;
/*!\cond */
typedef struct {
@@ -78,41 +79,38 @@ typedef struct {
InterpFilter filter_x;
InterpFilter filter_y;
} INTER_FILTER;
+
+/*!\brief Structure to store parameters and statistics used in non-rd inter mode
+ * evaluation.
+ */
+typedef struct {
+ BEST_PICKMODE best_pickmode;
+ RD_STATS this_rdc;
+ RD_STATS best_rdc;
+ int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES];
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ unsigned int vars[RTC_INTER_MODES][REF_FRAMES];
+ unsigned int ref_costs_single[REF_FRAMES];
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+ int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
+ int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES];
+ int use_ref_frame_mask[REF_FRAMES];
+ uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+} InterModeSearchStateNonrd;
/*!\endcond */
-#define NUM_INTER_MODES_RT 9
#define NUM_COMP_INTER_MODES_RT (6)
-#define NUM_INTER_MODES_REDUCED 8
-#define RTC_INTER_MODES (4)
-#define RTC_INTRA_MODES (4)
-#define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES))
-
-static const REF_MODE ref_mode_set_rt[NUM_INTER_MODES_RT] = {
- { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV },
- { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV },
- { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV },
- { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
- { ALTREF_FRAME, NEWMV }
-};
+#define NUM_INTER_MODES 12
// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT
// mode
-static const REF_MODE ref_mode_set_reduced[NUM_INTER_MODES_REDUCED] = {
+static const REF_MODE ref_mode_set[NUM_INTER_MODES] = {
{ LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV },
{ LAST_FRAME, GLOBALMV }, { LAST_FRAME, NEWMV },
{ GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
{ GOLDEN_FRAME, GLOBALMV }, { GOLDEN_FRAME, NEWMV },
-};
-
-static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = {
- { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
- { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
- { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
- { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
- { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG },
- { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB },
- { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 },
- { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA },
+ { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
+ { ALTREF_FRAME, GLOBALMV }, { ALTREF_FRAME, NEWMV },
};
static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = {
@@ -124,9 +122,6 @@ static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = {
{ { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV },
};
-static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
- SMOOTH_PRED };
-
static const INTER_FILTER filters_ref_set[9] = {
{ EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
{ EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH }, { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR },
@@ -135,20 +130,6 @@ static const INTER_FILTER filters_ref_set[9] = {
{ MULTITAP_SHARP, EIGHTTAP_SMOOTH }
};
-static INLINE int mode_offset(const PREDICTION_MODE mode) {
- if (mode >= NEARESTMV) {
- return INTER_OFFSET(mode);
- } else {
- switch (mode) {
- case DC_PRED: return 0;
- case V_PRED: return 1;
- case H_PRED: return 2;
- case SMOOTH_PRED: return 3;
- default: assert(0); return -1;
- }
- }
-}
-
enum {
// INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
INTER_NEAREST = (1 << NEARESTMV),
@@ -157,6 +138,137 @@ enum {
INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
};
+// The original scan order (default_scan_8x8) is modified to account for the
+// extra transpose in the Hadamard C implementations, i.e., aom_hadamard_lp_8x8_c
+// and aom_hadamard_8x8_c.
+static const int16_t default_scan_8x8_transpose[64] = {
+ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
+};
+
+// The original scan order (av1_default_iscan_8x8) is modified to match the
+// Hadamard AVX2 implementations, i.e., aom_hadamard_lp_8x8_avx2 and
+// aom_hadamard_8x8_avx2. The Hadamard AVX2 implementations reorder the output
+// coefficients, so the normal scan order is no longer guaranteed to visit low
+// coefficients first; the scan order is therefore modified accordingly.
+// Note that this table has to be used together with default_scan_8x8_transpose.
+static const int16_t av1_default_iscan_8x8_transpose[64] = {
+ 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63
+};
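The transposed scan table and the inverse-scan table above are intended to be used as a matched pair; spot-checking the first entries suggests iscan[scan[i]] == i, i.e. the two tables are mutual inverse permutations expressed in the Hadamard output order. Assuming that invariant holds, a consistency check like the sketch below (hypothetical helper name) is a cheap way to validate any regenerated table pair:

#include <assert.h>
#include <stdint.h>

/* Assumed invariant: `iscan` maps a coefficient position back to its rank in
 * `scan`, so the two tables are inverse permutations of 0..n-1. */
static void check_scan_pair_sketch(const int16_t *scan, const int16_t *iscan,
                                   int n) {
  for (int i = 0; i < n; ++i) {
    assert(scan[i] >= 0 && scan[i] < n);
    assert(iscan[scan[i]] == i);
  }
}

int main(void) {
  /* Trivial self-test with an identity permutation. For the real tables one
   * would call, e.g.:
   *   check_scan_pair_sketch(default_scan_8x8_transpose,
   *                          av1_default_iscan_8x8_transpose, 64); */
  const int16_t scan[4] = { 0, 1, 2, 3 }, iscan[4] = { 0, 1, 2, 3 };
  check_scan_pair_sketch(scan, iscan, 4);
  return 0;
}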
+
+// The original scan order (default_scan_16x16) is modified to account for the
+// extra transpose in the low-precision (lp) Hadamard C implementation, i.e.,
+// aom_hadamard_lp_16x16_c.
+static const int16_t default_scan_lp_16x16_transpose[256] = {
+ 0, 8, 2, 4, 10, 16, 24, 18, 12, 6, 64, 14, 20, 26, 32,
+ 40, 34, 28, 22, 72, 66, 68, 74, 80, 30, 36, 42, 48, 56, 50,
+ 44, 38, 88, 82, 76, 70, 128, 78, 84, 90, 96, 46, 52, 58, 1,
+ 9, 3, 60, 54, 104, 98, 92, 86, 136, 130, 132, 138, 144, 94, 100,
+ 106, 112, 62, 5, 11, 17, 25, 19, 13, 7, 120, 114, 108, 102, 152,
+ 146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65, 15, 21, 27,
+ 33, 41, 35, 29, 23, 73, 67, 124, 118, 168, 162, 156, 150, 200, 194,
+ 196, 202, 208, 158, 164, 170, 176, 126, 69, 75, 81, 31, 37, 43, 49,
+ 57, 51, 45, 39, 89, 83, 77, 71, 184, 178, 172, 166, 216, 210, 204,
+ 198, 206, 212, 218, 224, 174, 180, 186, 129, 79, 85, 91, 97, 47, 53,
+ 59, 61, 55, 105, 99, 93, 87, 137, 131, 188, 182, 232, 226, 220, 214,
+ 222, 228, 234, 240, 190, 133, 139, 145, 95, 101, 107, 113, 63, 121, 115,
+ 109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143,
+ 149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252,
+ 246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217,
+ 211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221,
+ 215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// The original scan order (default_scan_16x16) is modified to account for the
+// extra shift in the full-precision (fp) Hadamard C implementation, i.e.,
+// aom_hadamard_16x16_c. Note that the 16x16 lp and fp Hadamard transforms
+// generate different outputs, so they are handled separately.
+static const int16_t default_scan_fp_16x16_transpose[256] = {
+ 0, 4, 2, 8, 6, 16, 20, 18, 12, 10, 64, 14, 24, 22, 32,
+ 36, 34, 28, 26, 68, 66, 72, 70, 80, 30, 40, 38, 48, 52, 50,
+ 44, 42, 84, 82, 76, 74, 128, 78, 88, 86, 96, 46, 56, 54, 1,
+ 5, 3, 60, 58, 100, 98, 92, 90, 132, 130, 136, 134, 144, 94, 104,
+ 102, 112, 62, 9, 7, 17, 21, 19, 13, 11, 116, 114, 108, 106, 148,
+ 146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65, 15, 25, 23,
+ 33, 37, 35, 29, 27, 69, 67, 124, 122, 164, 162, 156, 154, 196, 194,
+ 200, 198, 208, 158, 168, 166, 176, 126, 73, 71, 81, 31, 41, 39, 49,
+ 53, 51, 45, 43, 85, 83, 77, 75, 180, 178, 172, 170, 212, 210, 204,
+ 202, 206, 216, 214, 224, 174, 184, 182, 129, 79, 89, 87, 97, 47, 57,
+ 55, 61, 59, 101, 99, 93, 91, 133, 131, 188, 186, 228, 226, 220, 218,
+ 222, 232, 230, 240, 190, 137, 135, 145, 95, 105, 103, 113, 63, 117, 115,
+ 109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143,
+ 153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252,
+ 250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213,
+ 211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221,
+ 219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251,
+ 255
+};
+#endif
+
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2.
+// The Hadamard AVX2 implementation reorders the output coefficients, so the
+// normal scan order is no longer guaranteed to visit low coefficients first;
+// the scan order is therefore modified accordingly. Note that this table has
+// to be used together with default_scan_lp_16x16_transpose.
+static const int16_t av1_default_iscan_lp_16x16_transpose[256] = {
+ 0, 44, 2, 46, 3, 63, 9, 69, 1, 45, 4, 64, 8, 68, 11,
+ 87, 5, 65, 7, 67, 12, 88, 18, 94, 6, 66, 13, 89, 17, 93,
+ 24, 116, 14, 90, 16, 92, 25, 117, 31, 123, 15, 91, 26, 118, 30,
+ 122, 41, 148, 27, 119, 29, 121, 42, 149, 48, 152, 28, 120, 43, 150,
+ 47, 151, 62, 177, 10, 86, 20, 96, 21, 113, 35, 127, 19, 95, 22,
+ 114, 34, 126, 37, 144, 23, 115, 33, 125, 38, 145, 52, 156, 32, 124,
+ 39, 146, 51, 155, 58, 173, 40, 147, 50, 154, 59, 174, 73, 181, 49,
+ 153, 60, 175, 72, 180, 83, 198, 61, 176, 71, 179, 84, 199, 98, 202,
+ 70, 178, 85, 200, 97, 201, 112, 219, 36, 143, 54, 158, 55, 170, 77,
+ 185, 53, 157, 56, 171, 76, 184, 79, 194, 57, 172, 75, 183, 80, 195,
+ 102, 206, 74, 182, 81, 196, 101, 205, 108, 215, 82, 197, 100, 204, 109,
+ 216, 131, 223, 99, 203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221,
+ 141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78, 193, 104,
+ 208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214,
+ 133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139,
+ 231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250,
+ 168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2.
+// The Hadamard AVX2 implementation reorders the output coefficients, so the
+// normal scan order is no longer guaranteed to visit low coefficients first;
+// the scan order is therefore modified accordingly. Note that this table has
+// to be used together with default_scan_fp_16x16_transpose.
+static const int16_t av1_default_iscan_fp_16x16_transpose[256] = {
+ 0, 44, 2, 46, 1, 45, 4, 64, 3, 63, 9, 69, 8, 68, 11,
+ 87, 5, 65, 7, 67, 6, 66, 13, 89, 12, 88, 18, 94, 17, 93,
+ 24, 116, 14, 90, 16, 92, 15, 91, 26, 118, 25, 117, 31, 123, 30,
+ 122, 41, 148, 27, 119, 29, 121, 28, 120, 43, 150, 42, 149, 48, 152,
+ 47, 151, 62, 177, 10, 86, 20, 96, 19, 95, 22, 114, 21, 113, 35,
+ 127, 34, 126, 37, 144, 23, 115, 33, 125, 32, 124, 39, 146, 38, 145,
+ 52, 156, 51, 155, 58, 173, 40, 147, 50, 154, 49, 153, 60, 175, 59,
+ 174, 73, 181, 72, 180, 83, 198, 61, 176, 71, 179, 70, 178, 85, 200,
+ 84, 199, 98, 202, 97, 201, 112, 219, 36, 143, 54, 158, 53, 157, 56,
+ 171, 55, 170, 77, 185, 76, 184, 79, 194, 57, 172, 75, 183, 74, 182,
+ 81, 196, 80, 195, 102, 206, 101, 205, 108, 215, 82, 197, 100, 204, 99,
+ 203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221,
+ 128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78, 193, 104,
+ 208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214,
+ 133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139,
+ 231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250,
+ 168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211,
+ 255
+};
+#endif
+
static INLINE int early_term_inter_search_with_sse(int early_term_idx,
BLOCK_SIZE bsize,
int64_t this_sse,
@@ -205,19 +317,70 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
memset(&bp->pmi, 0, sizeof(bp->pmi));
}
-static INLINE int subpel_select(AV1_COMP *cpi, BLOCK_SIZE bsize, int_mv *mv) {
- int mv_thresh = 4;
- const int is_low_resoln =
- (cpi->common.width * cpi->common.height <= 320 * 240);
- mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
- if (cpi->rc.avg_frame_low_motion > 0 && cpi->rc.avg_frame_low_motion < 40)
- mv_thresh = 12;
- mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
- if (abs(mv->as_fullmv.row) >= mv_thresh ||
- abs(mv->as_fullmv.col) >= mv_thresh)
- return HALF_PEL;
- else
- return cpi->sf.mv_sf.subpel_force_stop;
+static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int_mv *mv, MV ref_mv, FULLPEL_MV start_mv,
+ bool fullpel_performed_well) {
+ const int frame_lowmotion = cpi->rc.avg_frame_low_motion;
+ // Reduce MV precision for higher int MV value & frame-level motion
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion >= 3) {
+ int mv_thresh = 4;
+ const int is_low_resoln =
+ (cpi->common.width * cpi->common.height <= 320 * 240);
+ mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
+ if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12;
+ mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
+ if (abs(mv->as_fullmv.row) >= mv_thresh ||
+ abs(mv->as_fullmv.col) >= mv_thresh)
+ return HALF_PEL;
+ } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion >= 1) {
+ int mv_thresh;
+ const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } };
+ const int th_idx = cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion - 1;
+ assert(th_idx >= 0 && th_idx < 2);
+ if (frame_lowmotion > 0 && frame_lowmotion < 40)
+ mv_thresh = 12;
+ else
+ mv_thresh = (bsize >= BLOCK_32X32) ? th_vals[th_idx][0]
+ : (bsize >= BLOCK_16X16) ? th_vals[th_idx][1]
+ : th_vals[th_idx][2];
+ if (abs(mv->as_fullmv.row) >= (mv_thresh << 1) ||
+ abs(mv->as_fullmv.col) >= (mv_thresh << 1))
+ return FULL_PEL;
+ else if (abs(mv->as_fullmv.row) >= mv_thresh ||
+ abs(mv->as_fullmv.col) >= mv_thresh)
+ return HALF_PEL;
+ }
+ // Reduce MV precision for relatively static (e.g. background), low-complexity
+ // large areas.
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 2) {
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ assert(qband < 4);
+ if (x->content_state_sb.source_sad_nonrd <= kVeryLowSad &&
+ bsize > BLOCK_16X16 && qband != 0) {
+ if (x->source_variance < 500)
+ return FULL_PEL;
+ else if (x->source_variance < 5000)
+ return HALF_PEL;
+ }
+ } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 1) {
+ if (fullpel_performed_well && ref_mv.row == 0 && ref_mv.col == 0 &&
+ start_mv.row == 0 && start_mv.col == 0)
+ return HALF_PEL;
+ }
+ return cpi->sf.mv_sf.subpel_force_stop;
+}
+
+static bool use_aggressive_subpel_search_method(
+ MACROBLOCK *x, bool use_adaptive_subpel_search,
+ const bool fullpel_performed_well) {
+ if (!use_adaptive_subpel_search) return false;
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ assert(qband < 4);
+ if ((qband > 0) && (fullpel_performed_well ||
+ (x->content_state_sb.source_sad_nonrd <= kLowSad) ||
+ (x->source_variance < 100)))
+ return true;
+ return false;
}
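Both subpel_select() and use_aggressive_subpel_search_method() above bucket the block's qindex into four bands with qindex >> (QINDEX_BITS - 2); assuming libaom's usual QINDEX_BITS of 8, qindex 0..63 maps to band 0, 64..127 to band 1, and so on, and only the non-zero bands are allowed to relax the subpel search. A small hedged check of that mapping (the constant value here is an assumption, not taken from the diff):

#include <assert.h>

#define QINDEX_BITS_SKETCH 8  /* assumption: qindex occupies 8 bits (0..255) */

static int qband_sketch(int qindex) {
  return qindex >> (QINDEX_BITS_SKETCH - 2);  /* four equal bands of 64 values */
}

int main(void) {
  assert(qband_sketch(0) == 0);
  assert(qband_sketch(63) == 0);
  assert(qband_sketch(64) == 1);
  assert(qband_sketch(255) == 3);
  return 0;
}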
/*!\brief Runs Motion Estimation for a specific block and specific ref frame.
@@ -251,10 +414,11 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
MACROBLOCKD *xd = &x->e_mbd;
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
+ const SPEED_FEATURES *sf = &cpi->sf;
MB_MODE_INFO *mi = xd->mi[0];
struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
- int step_param = (cpi->sf.rt_sf.fullpel_search_step_param)
- ? cpi->sf.rt_sf.fullpel_search_step_param
+ int step_param = (sf->rt_sf.fullpel_search_step_param)
+ ? sf->rt_sf.fullpel_search_step_param
: cpi->mv_search_params.mv_step_param;
FULLPEL_MV start_mv;
const int ref = mi->ref_frame[0];
@@ -284,11 +448,9 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
else
center_mv = tmp_mv->as_mv;
- const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
- const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
- const int ref_stride = xd->plane[0].pre[0].stride;
- const search_site_config *src_search_sites = av1_get_search_site_config(
- x->search_site_cfg_buf, mv_search_params, search_method, ref_stride);
+ const SEARCH_METHODS search_method = sf->mv_sf.search_method;
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
src_search_sites,
@@ -311,28 +473,26 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
SUBPEL_MOTION_SEARCH_PARAMS ms_params;
av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
cost_list);
- if (cpi->sf.rt_sf.force_half_pel_block &&
- cpi->sf.mv_sf.subpel_force_stop < HALF_PEL)
- ms_params.forced_stop = subpel_select(cpi, bsize, tmp_mv);
- if (cpi->sf.rt_sf.reduce_zeromv_mvres && ref_mv.row == 0 &&
- ref_mv.col == 0 && start_mv.row == 0 && start_mv.col == 0) {
- // If both the refmv and the fullpel results show zero mv, then there is
- // high likelihood that the current block is static. So we can try to
- // reduce the mv resolution here.
- // These thresholds are the mean var rd collected from multiple encoding
- // runs.
- if ((bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) ||
- (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) ||
- (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127)) {
- ms_params.forced_stop = HALF_PEL;
- }
- }
+ const bool fullpel_performed_well =
+ (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) ||
+ (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) ||
+ (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127);
+ if (sf->rt_sf.reduce_mv_pel_precision_highmotion ||
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex)
+ ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv,
+ start_mv, fullpel_performed_well);
MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv);
- cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
- &x->pred_sse[ref], NULL);
-
+ // Adaptively downgrade the subpel search method based on block properties.
+ if (use_aggressive_subpel_search_method(
+ x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well))
+ av1_find_best_sub_pixel_tree_pruned_more(xd, cm, &ms_params,
+ subpel_start_mv, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
+ else
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
+ &x->pred_sse[ref], NULL);
*rate_mv =
av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
@@ -342,8 +502,8 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
int i;
for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
}
- // Final MV can not be equal to referance MV as this will trigger assert
- // later. This can happen if both NEAREST and NEAR modes were skipped
+ // The final MV can not be equal to the reference MV as this will trigger an
+ // assert later. This can happen if both NEAREST and NEAR modes were skipped.
rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
return rv;
}
@@ -407,15 +567,26 @@ static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
SUBPEL_MOTION_SEARCH_PARAMS ms_params;
av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL);
- if (cpi->sf.rt_sf.force_half_pel_block &&
- cpi->sf.mv_sf.subpel_force_stop < HALF_PEL)
- ms_params.forced_stop = subpel_select(cpi, bsize, &best_mv);
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion ||
+ cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) {
+ FULLPEL_MV start_mv = { .row = 0, .col = 0 };
+ ms_params.forced_stop =
+ subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false);
+ }
MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
cpi->mv_search_params.find_fractional_mv_step(
xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis,
&x->pred_sse[ref_frame], NULL);
frame_mv[NEWMV][ref_frame].as_int = best_mv.as_int;
+ // When NEWMV is the same as the ref_mv from the DRL, it is preferred to code
+ // the MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be skipped to
+ // avoid an assert failure at a later stage. This scenario can occur if
+ // NEARESTMV was not evaluated for ALTREF.
+ if (frame_mv[NEWMV][ref_frame].as_mv.col == ref_mv.col &&
+ frame_mv[NEWMV][ref_frame].as_mv.row == ref_mv.row)
+ return -1;
+
*rate_mv = av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv,
x->mv_costs->nmv_joint_cost,
x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
@@ -459,6 +630,29 @@ static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
}
}
+static INLINE void set_force_skip_flag(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, unsigned int sse,
+ int *force_skip) {
+ if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT &&
+ cpi->sf.rt_sf.tx_size_level_based_on_qstep &&
+ cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+ const int qstep = x->plane[0].dequant_QTX[1] >> (x->e_mbd.bd - 5);
+ const unsigned int qstep_sq = qstep * qstep;
+ // If the sse is low for low source variance blocks, mark those as
+ // transform skip.
+ // Note: Though qstep_sq is based on the ac qstep, the threshold is kept
+ // low so that a reliable early estimate of tx skip can be obtained by
+ // comparing it with sse.
+ if (sse < qstep_sq && x->source_variance < qstep_sq &&
+ x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0)
+ *force_skip = 1;
+ }
+}
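set_force_skip_flag() above derives an effective quantizer step from the stored AC dequant value with dequant_QTX[1] >> (bd - 5). As the nearby model_rd comments note, the transform coefficients are kept at 8 times an orthogonal transform, so the shift of bd - 5 (3 at 8-bit depth) rescales the stored dequant step into the same units as the block sse and source variance before it is squared. A short hedged illustration of that arithmetic (the example dequant value is made up):

#include <assert.h>
#include <stdint.h>

/* Hedged sketch mirroring `dequant_QTX[1] >> (bd - 5)` from the diff; the
 * input value 400 is made up purely for illustration. */
static unsigned int effective_qstep_sketch(uint16_t ac_dequant, int bit_depth) {
  return ac_dequant >> (bit_depth - 5);  /* >>3 for 8-bit, >>5 for 10-bit */
}

int main(void) {
  const unsigned int qstep = effective_qstep_sketch(400, 8);  /* 400 >> 3 == 50 */
  assert(qstep == 50);
  /* The skip test then compares sse and source variance against qstep^2. */
  assert(qstep * qstep == 2500);
  return 0;
}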
+
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+ (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false)
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
+
static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
MACROBLOCK *const x, unsigned int var,
unsigned int sse, int *force_skip) {
@@ -512,8 +706,8 @@ static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
}
- if (txfm_params->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32)
- tx_size = TX_16X16;
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize))
+ tx_size = TX_SIZE_FOR_BSIZE_GT32;
return AOMMIN(tx_size, TX_16X16);
}
@@ -534,31 +728,45 @@ static void block_variance(const uint8_t *src, int src_stride,
*sum = 0;
// This function is called for block sizes >= BLOCK_32x32. As per the design
- // the aom_get_sse_sum_8x8_quad() processes four 8x8 blocks (in a 8x32) per
- // call. Hence the width and height of the block need to be at least 8 and 32
- // samples respectively.
+ // the aom_get_var_sse_sum_8x8_quad() processes four 8x8 blocks (in an 8x32)
+ // per call. Hence the width and height of the block need to be at least 32
+ // and 8 samples respectively.
assert(w >= 32);
assert(h >= 8);
for (int i = 0; i < h; i += block_size) {
for (int j = 0; j < w; j += 32) {
- aom_get_sse_sum_8x8_quad(src + src_stride * i + j, src_stride,
- ref + ref_stride * i + j, ref_stride, &sse8x8[k],
- &sum8x8[k]);
-
- *sse += sse8x8[k] + sse8x8[k + 1] + sse8x8[k + 2] + sse8x8[k + 3];
- *sum += sum8x8[k] + sum8x8[k + 1] + sum8x8[k + 2] + sum8x8[k + 3];
- var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
- var8x8[k + 1] = sse8x8[k + 1] -
- (uint32_t)(((int64_t)sum8x8[k + 1] * sum8x8[k + 1]) >> 6);
- var8x8[k + 2] = sse8x8[k + 2] -
- (uint32_t)(((int64_t)sum8x8[k + 2] * sum8x8[k + 2]) >> 6);
- var8x8[k + 3] = sse8x8[k + 3] -
- (uint32_t)(((int64_t)sum8x8[k + 3] * sum8x8[k + 3]) >> 6);
+ aom_get_var_sse_sum_8x8_quad(
+ src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse8x8[k], &sum8x8[k], sse, sum, &var8x8[k]);
k += 4;
}
}
}
+static void block_variance_16x16_dual(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w,
+ int h, unsigned int *sse, int *sum,
+ int block_size, uint32_t *sse16x16,
+ uint32_t *var16x16) {
+ int k = 0;
+ *sse = 0;
+ *sum = 0;
+ // This function is called for block sizes >= BLOCK_32x32. As per the design
+ // the aom_get_var_sse_sum_16x16_dual() processes two 16x16 blocks (in a
+ // 16x32) per call. Hence the width and height of the block need to be at
+ // least 32 and 16 samples respectively.
+ assert(w >= 32);
+ assert(h >= 16);
+ for (int i = 0; i < h; i += block_size) {
+ for (int j = 0; j < w; j += 32) {
+ aom_get_var_sse_sum_16x16_dual(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride,
+ &sse16x16[k], sse, sum, &var16x16[k]);
+ k += 2;
+ }
+ }
+}
+
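block_variance_16x16_dual() above walks the partition in 16-row by 32-column tiles, with each aom_get_var_sse_sum_16x16_dual() call covering one tile, i.e. two 16x16 sub-blocks; a 64x64 partition therefore takes (64/16) * (64/32) = 8 calls producing 16 sub-block results. The scalar sketch below mirrors that tiling and the per-sub-block identity var = sse - sum^2 / 256; the helper names are hypothetical, not the libaom kernels.

#include <stdint.h>

/* Stand-in for one 16x16 sub-block: accumulate sse and sum of (src - ref). */
static void sse_sum_16x16_sketch(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 uint32_t *sse, int *sum) {
  *sse = 0;
  *sum = 0;
  for (int i = 0; i < 16; ++i) {
    for (int j = 0; j < 16; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += d;
      *sse += (uint32_t)(d * d);
    }
  }
}

/* Same tiling in spirit as block_variance_16x16_dual(): each 16x32 tile
 * yields two 16x16 results; var = sse - sum^2 / 256 per sub-block. */
static void block_variance_sketch(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride, int w,
                                  int h, uint32_t *sse16x16,
                                  uint32_t *var16x16) {
  int k = 0;
  for (int i = 0; i < h; i += 16) {
    for (int j = 0; j < w; j += 16) {  /* one 16x16 sub-block per iteration */
      int sum;
      sse_sum_16x16_sketch(src + i * src_stride + j, src_stride,
                           ref + i * ref_stride + j, ref_stride, &sse16x16[k],
                           &sum);
      var16x16[k] = sse16x16[k] - (uint32_t)(((int64_t)sum * sum) >> 8);
      ++k;
    }
  }
}

int main(void) {
  static uint8_t src[64 * 64], ref[64 * 64];  /* zero-initialized */
  uint32_t sse[16], var[16];
  block_variance_sketch(src, 64, ref, 64, 64, 64, sse, var);
  return (int)var[0];  /* 0 for identical buffers */
}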
static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
unsigned int *sse_i, int *sum_i,
unsigned int *var_o, unsigned int *sse_o,
@@ -594,6 +802,172 @@ static int ac_thr_factor(const int speed, const int width, const int height,
return 1;
}
+// Sets the early_term flag based on chroma plane prediction
+static INLINE void set_early_term_based_on_uv_plane(
+ AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row,
+ int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx,
+ const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct macroblock_plane *const p = &x->plane[0];
+ const uint32_t dc_quant = p->dequant_QTX[0];
+ const uint32_t ac_quant = p->dequant_QTX[1];
+ const int64_t dc_thr = dc_quant * dc_quant >> 6;
+ int64_t ac_thr = ac_quant * ac_quant >> 6;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ int ac_test = 1;
+ int dc_test = 1;
+ const int norm_sum = abs(sum) >> (bw + bh);
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5)
+ ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
+ norm_sum, cpi->svc.temporal_layer_id);
+ else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+#else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+
+#endif
+
+ for (int k = 0; k < num_blk; k++) {
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+ }
+
+ // Check if chroma can be skipped based on ac and dc test flags.
+ if (ac_test && dc_test) {
+ int skip_uv[2] = { 0 };
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+ // Transform skipping test in UV planes.
+ for (int i = 1; i <= 2; i++) {
+ int j = i - 1;
+ skip_uv[j] = 1;
+ if (x->color_sensitivity[j]) {
+ skip_uv[j] = 0;
+ struct macroblock_plane *const puv = &x->plane[i];
+ struct macroblockd_plane *const puvd = &xd->plane[i];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(
+ bsize, puvd->subsampling_x, puvd->subsampling_y);
+ // Adjust these thresholds for UV.
+ const int64_t uv_dc_thr =
+ (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3;
+ const int64_t uv_ac_thr =
+ (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i,
+ i);
+ var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
+ puvd->dst.buf,
+ puvd->dst.stride, &sse_uv[j]);
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ }
+ }
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+ }
+}
+
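The skip decision in set_early_term_based_on_uv_plane() reduces to a per-transform-block energy test against thresholds derived from the squared quantizer steps, shifted down by 6 to undo the 8x coefficient scaling. A simplified sketch of that core test with illustrative names (the real code additionally scales ac_thr by speed/denoiser factors and handles the var == 0 and sse == var corner cases):

static int tx_block_quantizes_to_zero(unsigned int sse, unsigned int var,
                                      uint32_t dc_q, uint32_t ac_q) {
  const int64_t dc_thr = ((int64_t)dc_q * dc_q) >> 6;  // DC energy threshold
  const int64_t ac_thr = ((int64_t)ac_q * ac_q) >> 6;  // AC energy threshold
  const int ac_zero = var < ac_thr;                    // all AC coeffs ~ 0
  const int dc_zero = (int64_t)sse - (int64_t)var < dc_thr;  // DC coeff ~ 0
  return ac_zero && dc_zero;
}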
+static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int calculate_rd, int *early_term,
+ BLOCK_SIZE bsize,
+ unsigned int sse) {
+ if (calculate_rd) {
+ if (!*early_term) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+
+ model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh,
+ &rd_stats->rate, &rd_stats->dist);
+ }
+
+ if (*early_term) {
+ rd_stats->rate = 0;
+ rd_stats->dist = sse << 4;
+ }
+ }
+}
+
+static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MACROBLOCK *x,
+ MACROBLOCKD *xd, RD_STATS *rd_stats,
+ int *early_term, int calculate_rd,
+ int64_t best_sse,
+ unsigned int *var_output,
+ unsigned int var_prune_threshold) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ int test_skip = 1;
+ unsigned int var;
+ int sum;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ unsigned int sse16x16[64] = { 0 };
+ unsigned int var16x16[64] = { 0 };
+ assert(xd->mi[0]->tx_size == TX_16X16);
+ assert(bsize > BLOCK_32X32);
+
+ // Calculate variance for whole partition, and also save 16x16 blocks'
+ // variance to be used in following transform skipping test.
+ block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16,
+ sse16x16, var16x16);
+
+ var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+ if (var_output) {
+ *var_output = var;
+ if (*var_output > var_prune_threshold) {
+ return;
+ }
+ }
+
+ rd_stats->sse = sse;
+ // Skipping test
+ *early_term = 0;
+ set_force_skip_flag(cpi, x, sse, early_term);
+ // The code below for setting the skip flag assumes a transform size of at
+ // least 8x8; here the transform size is fixed at 16x16 (asserted above).
+ MB_MODE_INFO *const mi = xd->mi[0];
+ if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+ mi->mode))
+ test_skip = 0;
+
+ if (*early_term) test_skip = 0;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ if (test_skip) {
+ const unsigned int *sse_tx = sse16x16;
+ const unsigned int *var_tx = var16x16;
+ const unsigned int num_block = (1 << (bw + bh - 2)) >> 2;
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_block, sse_tx, var_tx, sum,
+ var, sse);
+ }
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
+}
+
static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
int mi_row, int mi_col, MACROBLOCK *x,
MACROBLOCKD *xd, RD_STATS *rd_stats,
@@ -601,36 +975,43 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
int64_t best_sse,
unsigned int *var_output,
unsigned int var_prune_threshold) {
+ if (x->force_zeromv_skip_for_blk) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ return;
+ }
+
+ // For block sizes greater than 32x32, the transform size is always 16x16.
+ // This function avoids calling calculate_variance() for tx_size 16x16 cases
+ // by directly populating variance at tx_size level from
+ // block_variance_16x16_dual() function.
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) {
+ xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32;
+ model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats,
+ early_term, calculate_rd, best_sse, var_output,
+ var_prune_threshold);
+ return;
+ }
+
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
unsigned int sse;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &xd->plane[0];
- const uint32_t dc_quant = p->dequant_QTX[0];
- const uint32_t ac_quant = p->dequant_QTX[1];
- const int64_t dc_thr = dc_quant * dc_quant >> 6;
- int64_t ac_thr = ac_quant * ac_quant >> 6;
int test_skip = 1;
unsigned int var;
int sum;
const int bw = b_width_log2_lookup[bsize];
const int bh = b_height_log2_lookup[bsize];
- const int num8x8 = 1 << (bw + bh - 2);
unsigned int sse8x8[256] = { 0 };
int sum8x8[256] = { 0 };
unsigned int var8x8[256] = { 0 };
TX_SIZE tx_size;
- int k;
-
- if (x->force_zeromv_skip) {
- *early_term = 1;
- rd_stats->rate = 0;
- rd_stats->dist = 0;
- rd_stats->sse = 0;
- return;
- }
// Calculate variance for whole partition, and also save 8x8 blocks' variance
// to be used in following transform skipping test.
@@ -645,26 +1026,12 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
}
rd_stats->sse = sse;
-
-#if CONFIG_AV1_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
- cpi->oxcf.speed > 5)
- ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
- (abs(sum) >> (bw + bh)),
- cpi->svc.temporal_layer_id);
- else
- ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
- cpi->common.height, abs(sum) >> (bw + bh));
-#else
- ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
- cpi->common.height, abs(sum) >> (bw + bh));
-
-#endif
// Skipping test
*early_term = 0;
tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term);
- // The code below for setting skip flag assumes tranform size of at least 8x8,
- // so force this lower limit on transform.
+ assert(tx_size <= TX_16X16);
+ // The code below for setting skip flag assumes transform size of at least
+ // 8x8, so force this lower limit on transform.
if (tx_size < TX_8X8) tx_size = TX_8X8;
xd->mi[0]->tx_size = tx_size;
@@ -682,103 +1049,36 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
unsigned int sse16x16[64] = { 0 };
int sum16x16[64] = { 0 };
unsigned int var16x16[64] = { 0 };
- const int num16x16 = num8x8 >> 2;
-
- unsigned int sse32x32[16] = { 0 };
- int sum32x32[16] = { 0 };
- unsigned int var32x32[16] = { 0 };
- const int num32x32 = num8x8 >> 4;
-
- int ac_test = 1;
- int dc_test = 1;
- const int num = (tx_size == TX_8X8)
- ? num8x8
- : ((tx_size == TX_16X16) ? num16x16 : num32x32);
- const unsigned int *sse_tx =
- (tx_size == TX_8X8) ? sse8x8
- : ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
- const unsigned int *var_tx =
- (tx_size == TX_8X8) ? var8x8
- : ((tx_size == TX_16X16) ? var16x16 : var32x32);
-
- // Calculate variance if tx_size > TX_8X8
- if (tx_size >= TX_16X16)
+ const unsigned int *sse_tx = sse8x8;
+ const unsigned int *var_tx = var8x8;
+ unsigned int num_blks = 1 << (bw + bh - 2);
+
+ if (tx_size >= TX_16X16) {
calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
sum16x16);
- if (tx_size == TX_32X32)
- calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
- sse32x32, sum32x32);
-
- for (k = 0; k < num; k++)
- // Check if all ac coefficients can be quantized to zero.
- if (!(var_tx[k] < ac_thr || var == 0)) {
- ac_test = 0;
- break;
- }
-
- for (k = 0; k < num; k++)
- // Check if dc coefficient can be quantized to zero.
- if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
- dc_test = 0;
- break;
- }
-
- if (ac_test && dc_test) {
- int skip_uv[2] = { 0 };
- unsigned int var_uv[2];
- unsigned int sse_uv[2];
- AV1_COMMON *const cm = &cpi->common;
- // Transform skipping test in UV planes.
- for (int i = 1; i <= 2; i++) {
- int j = i - 1;
- skip_uv[j] = 1;
- if (x->color_sensitivity[j]) {
- skip_uv[j] = 0;
- struct macroblock_plane *const puv = &x->plane[i];
- struct macroblockd_plane *const puvd = &xd->plane[i];
- const BLOCK_SIZE uv_bsize = get_plane_block_size(
- bsize, puvd->subsampling_x, puvd->subsampling_y);
- // Adjust these thresholds for UV.
- const int64_t uv_dc_thr =
- (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3;
- const int64_t uv_ac_thr =
- (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3;
- av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i,
- i);
- var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(
- puv->src.buf, puv->src.stride, puvd->dst.buf, puvd->dst.stride,
- &sse_uv[j]);
- if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
- (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
- skip_uv[j] = 1;
- else
- break;
- }
- }
- if (skip_uv[0] & skip_uv[1]) {
- *early_term = 1;
- }
- }
- }
- if (calculate_rd) {
- if (!*early_term) {
- const int bwide = block_size_wide[bsize];
- const int bhigh = block_size_high[bsize];
-
- model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh,
- &rd_stats->rate, &rd_stats->dist);
- }
-
- if (*early_term) {
- rd_stats->rate = 0;
- rd_stats->dist = sse << 4;
+ sse_tx = sse16x16;
+ var_tx = var16x16;
+ num_blks = num_blks >> 2;
}
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_blks, sse_tx, var_tx, sum,
+ var, sse);
}
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
}
static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
- RD_STATS *rd_stats, int calculate_rd) {
+ RD_STATS *rd_stats, unsigned int *var_out,
+ int calculate_rd, int *early_term) {
+ if (x->force_zeromv_skip_for_blk && early_term != NULL) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ }
+
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -796,6 +1096,9 @@ static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
int force_skip = 0;
xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip);
+ if (var_out) {
+ *var_out = var;
+ }
if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) {
const int bwide = block_size_wide[bsize];
@@ -842,21 +1145,30 @@ static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
}
}
-#define DECLARE_LOOP_VARS_BLOCK_YRD() \
+#define DECLARE_BLOCK_YRD_BUFFERS() \
+ DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]); \
+ uint16_t eob[1];
+
+#define DECLARE_BLOCK_YRD_VARS() \
+ /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the \
+ * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \
+ * as a non-const so we can reassign it to macroblock_plane::coeff. */ \
+ int16_t *low_coeff = (int16_t *)coeff_buf; \
+ int16_t *const low_qcoeff = (int16_t *)qcoeff_buf; \
+ int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf; \
const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; \
- const int block_offset = BLOCK_OFFSET(block + s); \
- int16_t *const low_coeff = (int16_t *)p->coeff + block_offset; \
- int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset; \
- int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset; \
- uint16_t *const eob = &p->eobs[block + s]; \
- const int diff_stride = bw; \
+ const int diff_stride = bw;
+
+#define DECLARE_LOOP_VARS_BLOCK_YRD() \
const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
#if CONFIG_AV1_HIGHBITDEPTH
-#define DECLARE_HBD_LOOP_VARS_BLOCK_YRD() \
- tran_low_t *const coeff = p->coeff + block_offset; \
- tran_low_t *const qcoeff = p->qcoeff + block_offset; \
- tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+#define DECLARE_BLOCK_YRD_HBD_VARS() \
+ tran_low_t *const coeff = coeff_buf; \
+ tran_low_t *const qcoeff = qcoeff_buf; \
+ tran_low_t *const dqcoeff = dqcoeff_buf;
static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
MACROBLOCK *x, int *skippable, const int step, const int ncoeffs,
@@ -901,26 +1213,21 @@ static AOM_FORCE_INLINE void update_yrd_loop_vars(
* \callergraph
* Calculates RD Cost using Hadamard transform. For low bit depth this function
* uses low-precision set of functions (16-bit) and 32 bit for high bit depth
- * \param[in] cpi Top-level encoder structure
* \param[in] x Pointer to structure holding all the data for
the current macroblock
- * \param[in] mi_row Row index in 4x4 units
- * \param[in] mi_col Column index in 4x4 units
* \param[in] this_rdc Pointer to calculated RD Cost
* \param[in] skippable Pointer to a flag indicating possible tx skip
* \param[in] bsize Current block size
* \param[in] tx_size Transform size
- * \param[in] tx_type Transform kernel type
* \param[in] is_inter_mode Flag to indicate inter mode
*
- * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
* \c this_rdc. \c skippable flag is set if there is no non-zero quantized
* coefficients for Hadamard transform
*/
-void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
- int mi_col, RD_STATS *this_rdc, int *skippable,
- BLOCK_SIZE bsize, TX_SIZE tx_size, TX_TYPE tx_type,
- int is_inter_mode) {
+static void block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+ const BLOCK_SIZE bsize, const TX_SIZE tx_size,
+ const int is_inter_mode) {
MACROBLOCKD *xd = &x->e_mbd;
const struct macroblockd_plane *pd = &xd->plane[0];
struct macroblock_plane *const p = &x->plane[0];
@@ -929,7 +1236,7 @@ void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
const int num_4x4_h = mi_size_high[bsize];
const int step = 1 << (tx_size << 1);
const int block_step = (1 << tx_size);
- const int row_step = step * num_4x4_w / block_step;
+ const int row_step = step * num_4x4_w >> tx_size;
int block = 0;
const int max_blocks_wide =
num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
@@ -946,10 +1253,6 @@ void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
sh_blk_skip = 1;
}
- (void)mi_row;
- (void)mi_col;
- (void)cpi;
-
#if CONFIG_AV1_HIGHBITDEPTH
if (use_hbd) {
aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
@@ -963,193 +1266,160 @@ void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
pd->dst.buf, pd->dst.stride);
#endif
- *skippable = 1;
- int tx_wd = 0;
- switch (tx_size) {
- case TX_64X64:
- assert(0); // Not implemented
- break;
- case TX_32X32:
- assert(0); // Not used
- break;
- case TX_16X16: tx_wd = 16; break;
- case TX_8X8: tx_wd = 8; break;
- default:
- assert(tx_size == TX_4X4);
- tx_wd = 4;
- break;
- }
-
+ // Keep the intermediate value on the stack here. Writing directly to
+ // skippable causes speed regression due to load-and-store issues in
+ // update_yrd_loop_vars.
+ int temp_skippable = 1;
this_rdc->dist = 0;
this_rdc->rate = 0;
-#if !CONFIG_AV1_HIGHBITDEPTH
- if (tx_type == IDTX) {
- // Keep track of the row and column of the blocks we use so that we know
- // if we are in the unrestricted motion border.
- for (int r = 0; r < max_blocks_high; r += block_step) {
- for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
- DECLARE_LOOP_VARS_BLOCK_YRD()
-
- for (int idy = 0; idy < tx_wd; ++idy)
- for (int idx = 0; idx < tx_wd; ++idx)
- low_coeff[idy * tx_wd + idx] =
- src_diff[idy * diff_stride + idx] * 8;
-
- av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
- p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
- p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- assert(*eob <= 1024);
- update_yrd_loop_vars(x, skippable, step, *eob, low_coeff, low_qcoeff,
- low_dqcoeff, this_rdc, &eob_cost,
- (r * num_blk_skip_w + c) >> sh_blk_skip);
- }
- block += row_step;
- }
- } else {
-#else
- {
- (void)tx_wd;
-#endif
- // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
- // can be done per function call. Hence the call of Hadamard txfm is
- // abstracted here for the specified cases.
- int is_tx_8x8_dual_applicable =
- (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
- block_size_high[bsize] >= 8);
+ // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
+ // can be done per function call. Hence the call of Hadamard txfm is
+ // abstracted here for the specified cases.
+ int is_tx_8x8_dual_applicable =
+ (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
+ block_size_high[bsize] >= 8);
#if CONFIG_AV1_HIGHBITDEPTH
- // As of now, dual implementation of hadamard txfm is available for low
- // bitdepth and when tx_type != IDTX.
- if (use_hbd || tx_type == IDTX) is_tx_8x8_dual_applicable = 0;
+ // As of now, dual implementation of hadamard txfm is available for low
+ // bitdepth.
+ if (use_hbd) is_tx_8x8_dual_applicable = 0;
#endif
- if (is_tx_8x8_dual_applicable) {
- aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide,
- num_4x4_w, step, block_step);
- }
+ if (is_tx_8x8_dual_applicable) {
+ aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
+ step, block_step);
+ }
- // Keep track of the row and column of the blocks we use so that we know
- // if we are in the unrestricted motion border.
- for (int r = 0; r < max_blocks_high; r += block_step) {
- for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
- DECLARE_LOOP_VARS_BLOCK_YRD()
+ DECLARE_BLOCK_YRD_BUFFERS()
+ DECLARE_BLOCK_YRD_VARS()
#if CONFIG_AV1_HIGHBITDEPTH
- DECLARE_HBD_LOOP_VARS_BLOCK_YRD()
+ DECLARE_BLOCK_YRD_HBD_VARS()
#else
- (void)use_hbd;
+ (void)use_hbd;
#endif
- switch (tx_size) {
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+ DECLARE_LOOP_VARS_BLOCK_YRD()
+
+ switch (tx_size) {
#if CONFIG_AV1_HIGHBITDEPTH
- case TX_16X16:
- if (use_hbd) {
- aom_hadamard_16x16(src_diff, diff_stride, coeff);
- av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
- p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
- dqcoeff, p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- } else {
- if (tx_type == IDTX) {
- aom_pixel_scale(src_diff, diff_stride, low_coeff, 3, 2, 2);
- } else {
- aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
- }
- av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
- p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
- p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- }
- break;
- case TX_8X8:
- if (use_hbd) {
- aom_hadamard_8x8(src_diff, diff_stride, coeff);
- av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
- p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
- dqcoeff, p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- } else {
- if (tx_type == IDTX) {
- aom_pixel_scale(src_diff, diff_stride, low_coeff, 3, 1, 1);
- } else if (!is_tx_8x8_dual_applicable) {
- aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
- } else {
- assert(is_tx_8x8_dual_applicable);
- }
- av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX,
- p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
- p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- }
- break;
- default:
- assert(tx_size == TX_4X4);
- if (use_hbd) {
- aom_fdct4x4(src_diff, coeff, diff_stride);
- av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
- p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
- dqcoeff, p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- } else {
- if (tx_type == IDTX) {
- for (int idy = 0; idy < 4; ++idy)
- for (int idx = 0; idx < 4; ++idx)
- low_coeff[idy * 4 + idx] = src_diff[idy * diff_stride + idx]
- << 3;
- } else {
- aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
- }
- av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX,
- p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
- p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- }
- break;
-#else
- case TX_16X16:
+ case TX_16X16:
+ if (use_hbd) {
+ aom_hadamard_16x16(src_diff, diff_stride, coeff);
+ av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+ dqcoeff, p->dequant_QTX, eob,
+ // default_scan_fp_16x16_transpose and
+ // av1_default_iscan_fp_16x16_transpose have to be
+ // used together.
+ default_scan_fp_16x16_transpose,
+ av1_default_iscan_fp_16x16_transpose);
+ } else {
aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
- p->dequant_QTX, eob, scan_order->scan,
- scan_order->iscan);
- break;
- case TX_8X8:
- if (!is_tx_8x8_dual_applicable) {
- aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+ p->dequant_QTX, eob,
+ // default_scan_lp_16x16_transpose and
+ // av1_default_iscan_lp_16x16_transpose have to be
+ // used together.
+ default_scan_lp_16x16_transpose,
+ av1_default_iscan_lp_16x16_transpose);
+ }
+ break;
+ case TX_8X8:
+ if (use_hbd) {
+ aom_hadamard_8x8(src_diff, diff_stride, coeff);
+ av1_quantize_fp(
+ coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
+ default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+ } else {
+ if (is_tx_8x8_dual_applicable) {
+ // The coeffs are pre-computed for the whole block, so re-assign
+ // low_coeff to the appropriate location.
+ const int block_offset = BLOCK_OFFSET(block + s);
+ low_coeff = (int16_t *)p->coeff + block_offset;
} else {
- assert(is_tx_8x8_dual_applicable);
+ aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
}
- av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
- low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
- scan_order->scan, scan_order->iscan);
- break;
- default:
+ av1_quantize_lp(
+ low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff,
+ low_dqcoeff, p->dequant_QTX, eob,
+ // default_scan_8x8_transpose and
+ // av1_default_iscan_8x8_transpose have to be used together.
+ default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+ }
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ // In the tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate
+ // the normal coefficient order, so we don't need to change the scan
+ // order here.
+ if (use_hbd) {
+ aom_fdct4x4(src_diff, coeff, diff_stride);
+ av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+ dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+ scan_order->iscan);
+ } else {
aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
scan_order->scan, scan_order->iscan);
- break;
+ }
+ break;
+#else
+ case TX_16X16:
+ aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+ av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ default_scan_lp_16x16_transpose,
+ av1_default_iscan_lp_16x16_transpose);
+ break;
+ case TX_8X8:
+ if (is_tx_8x8_dual_applicable) {
+ // The coeffs are pre-computed for the whole block, so re-assign
+ // low_coeff to the appropriate location.
+ const int block_offset = BLOCK_OFFSET(block + s);
+ low_coeff = (int16_t *)p->coeff + block_offset;
+ } else {
+ aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+ }
+ av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ default_scan_8x8_transpose,
+ av1_default_iscan_8x8_transpose);
+ break;
+ default:
+ aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+ av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
#endif
- }
- assert(*eob <= 1024);
+ }
+ assert(*eob <= 1024);
#if CONFIG_AV1_HIGHBITDEPTH
- if (use_hbd)
- update_yrd_loop_vars_hbd(x, skippable, step, *eob, coeff, qcoeff,
- dqcoeff, this_rdc, &eob_cost,
- (r * num_blk_skip_w + c) >> sh_blk_skip);
- else
+ if (use_hbd)
+ update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff,
+ dqcoeff, this_rdc, &eob_cost,
+ (r * num_blk_skip_w + c) >> sh_blk_skip);
+ else
#endif
- update_yrd_loop_vars(x, skippable, step, *eob, low_coeff, low_qcoeff,
- low_dqcoeff, this_rdc, &eob_cost,
- (r * num_blk_skip_w + c) >> sh_blk_skip);
- }
- block += row_step;
+ update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+ low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+ (r * num_blk_skip_w + c) >> sh_blk_skip);
}
+ block += row_step;
}
- this_rdc->skip_txfm = *skippable;
+
+ this_rdc->skip_txfm = *skippable = temp_skippable;
if (this_rdc->sse < INT64_MAX) {
this_rdc->sse = (this_rdc->sse << 6) >> 2;
- if (*skippable) {
+ if (temp_skippable) {
this_rdc->dist = 0;
this_rdc->dist = this_rdc->sse;
return;
@@ -1161,6 +1431,131 @@ void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
}
+// Explicitly enumerate the cases so the compiler can generate SIMD for the
+// function. According to the disassembler, gcc generates SSE code for each of
+// the possible block sizes. The hottest case is tx_width 16, which takes up
+// about 8% of the self cycles of av1_nonrd_pick_inter_mode_sb. Since
+// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
+// potential room for improvement from an AVX2 optimization is only
+// 3% * 8% = 0.24% of total encoding time.
+static AOM_INLINE void scale_square_buf_vals(int16_t *dst, const int tx_width,
+ const int16_t *src,
+ const int src_stride) {
+#define DO_SCALING \
+ do { \
+ for (int idy = 0; idy < tx_width; ++idy) { \
+ for (int idx = 0; idx < tx_width; ++idx) { \
+ dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
+ } \
+ } \
+ } while (0)
+
+ if (tx_width == 4) {
+ DO_SCALING;
+ } else if (tx_width == 8) {
+ DO_SCALING;
+ } else if (tx_width == 16) {
+ DO_SCALING;
+ } else {
+ assert(0);
+ }
+
+#undef DO_SCALING
+}
+
+/*!\brief Calculates RD Cost when the block uses Identity transform.
+ * Note that this function is only for low bit depth encoding, since it
+ * is called in real-time mode for now, which sets high bit depth to 0:
+ * -DCONFIG_AV1_HIGHBITDEPTH=0
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost using the low-precision (16-bit) set of functions,
+ * since this path is only used for low bit depth encoding.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] skippable Pointer to a flag indicating possible tx skip
+ * \param[in] bsize Current block size
+ * \param[in] tx_size Transform size
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc. \c skippable flag is set if all coefficients are zero.
+ */
+static void block_yrd_idtx(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+ const BLOCK_SIZE bsize, const TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+ int eob_cost = 0;
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ const int num_blk_skip_w = num_4x4_w >> 1;
+ const int sh_blk_skip = 1;
+ // Keep the intermediate value on the stack here. Writing directly to
+ // skippable causes speed regression due to load-and-store issues in
+ // update_yrd_loop_vars.
+ int temp_skippable = 1;
+ int tx_wd = 0;
+ switch (tx_size) {
+ case TX_64X64:
+ assert(0); // Not implemented
+ break;
+ case TX_32X32:
+ assert(0); // Not used
+ break;
+ case TX_16X16: tx_wd = 16; break;
+ case TX_8X8: tx_wd = 8; break;
+ default:
+ assert(tx_size == TX_4X4);
+ tx_wd = 4;
+ break;
+ }
+ this_rdc->dist = 0;
+ this_rdc->rate = 0;
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ DECLARE_BLOCK_YRD_BUFFERS()
+ DECLARE_BLOCK_YRD_VARS()
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+ DECLARE_LOOP_VARS_BLOCK_YRD()
+ scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
+ av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
+ p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
+ eob, scan_order->scan, scan_order->iscan);
+ assert(*eob <= 1024);
+ update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+ low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+ (r * num_blk_skip_w + c) >> sh_blk_skip);
+ }
+ }
+ this_rdc->skip_txfm = *skippable = temp_skippable;
+ if (this_rdc->sse < INT64_MAX) {
+ this_rdc->sse = (this_rdc->sse << 6) >> 2;
+ if (temp_skippable) {
+ this_rdc->dist = 0;
+ this_rdc->dist = this_rdc->sse;
+ return;
+ }
+ }
+ // If skippable is set, rate gets clobbered later.
+ this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+ this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
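For the identity transform, the forward "transform" in block_yrd_idtx() reduces to the 8x scaling that keeps IDTX coefficients on the same scale as the orthogonal transforms (per the file's convention that transform coefficients are 8 times an orthogonal transform). A standalone sketch of that step for a single 8x8 block (the name is illustrative; scale_square_buf_vals() above is the real, size-generic version):

static void idtx_fwd_8x8_sketch(const int16_t *src_diff, int diff_stride,
                                int16_t *coeff) {
  // Scale the spatial residual by 8 (3 bits) so the regular low-precision
  // quantizer (av1_quantize_lp) can be reused unchanged.
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c)
      coeff[r * 8 + c] = src_diff[r * diff_stride + c] * 8;
}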
static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode,
MV_REFERENCE_FRAME ref_frame0,
MV_REFERENCE_FRAME ref_frame1,
@@ -1337,10 +1732,10 @@ static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
}
}
-static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
- MACROBLOCK *x, MACROBLOCKD *xd,
- RD_STATS *this_rdc, int64_t *sse_y,
- int start_plane, int stop_plane) {
+static int64_t model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *this_rdc, int start_plane,
+ int stop_plane) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -1348,7 +1743,7 @@ static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
int rate;
int64_t dist;
int i;
- int64_t tot_sse = *sse_y;
+ int64_t tot_sse = 0;
this_rdc->rate = 0;
this_rdc->dist = 0;
@@ -1392,7 +1787,7 @@ static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
this_rdc->skip_txfm = 1;
}
- *sse_y = tot_sse;
+ return tot_sse;
}
/*!\cond */
@@ -1418,10 +1813,10 @@ struct estimate_block_intra_args {
* \param[in] col Column of a current TX block
* \param[in] plane_bsize Block size of a current prediction block
* \param[in] tx_size Transform size
- * \param[in] arg Pointer to a structure that holds paramaters
+ * \param[in] arg Pointer to a structure that holds parameters
* for intra mode search
*
- * \return Nothing is returned. Instead, best mode and RD Cost of the best mode
+ * \remark Nothing is returned. Instead, best mode and RD Cost of the best mode
* are set in \c args->rdc and \c args->mode
*/
static void estimate_block_intra(int plane, int block, int row, int col,
@@ -1451,11 +1846,10 @@ static void estimate_block_intra(int plane, int block, int row, int col,
pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
if (plane == 0) {
- av1_block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx,
- AOMMIN(tx_size, TX_16X16), DCT_DCT, 0);
+ block_yrd(x, &this_rdc, &args->skippable, bsize_tx,
+ AOMMIN(tx_size, TX_16X16), 0);
} else {
- int64_t sse = 0;
- model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, &sse, plane, plane);
+ model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane);
}
p->src.buf = src_buf_base;
@@ -1508,7 +1902,7 @@ static void recheck_zeromv_after_denoising(
BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) {
// If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
// denoised result. Only do this under noise conditions, and if rdcost of
- // ZEROMV onoriginal source is not significantly higher than rdcost of best
+ // ZEROMV on original source is not significantly higher than rdcost of best
// mode.
if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow &&
ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
@@ -1531,7 +1925,8 @@ static void recheck_zeromv_after_denoising(
mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
+ unsigned int var;
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL);
const int16_t mode_ctx =
av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
@@ -1572,7 +1967,7 @@ static void recheck_zeromv_after_denoising(
#define FILTER_SEARCH_SIZE 2
-/*!\brief Searches for the best intrpolation filter
+/*!\brief Searches for the best interpolation filter
*
* \ingroup nonrd_mode_search
* \callgraph
@@ -1587,9 +1982,11 @@ static void recheck_zeromv_after_denoising(
* \param[in] x Pointer to structure holding all the
* data for the current macroblock
* \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] inter_pred_params_sr Pointer to structure holding parameters of
+ inter prediction for single reference
* \param[in] mi_row Row index in 4x4 units
* \param[in] mi_col Column index in 4x4 units
- * \param[in] tmp Pointer to a temporary buffer for
+ * \param[in] tmp_buffer Pointer to a temporary buffer for
* prediction re-use
* \param[in] bsize Current block size
* \param[in] reuse_inter_pred Flag, indicating prediction re-use
@@ -1597,22 +1994,27 @@ static void recheck_zeromv_after_denoising(
* for prediction re-use
* \param[out] this_early_term Flag, indicating that transform can be
* skipped
+ * \param[out] var The residue variance of the current
+ * predictor.
* \param[in] use_model_yrd_large Flag, indicating special logic to handle
* large blocks
* \param[in] best_sse Best sse so far.
+ * \param[in] comp_pred Flag, indicating compound mode.
*
- * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
* \c this_rdc and best filter is placed to \c mi->interp_filters. In case
- * \c reuse_inter_pred flag is set, this function also ouputs
+ * \c reuse_inter_pred flag is set, this function also outputs
* \c this_mode_pred. Also \c this_early_temp is set if transform can be
* skipped
*/
static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
- int mi_row, int mi_col, PRED_BUFFER *tmp,
+ InterPredParams *inter_pred_params_sr, int mi_row,
+ int mi_col, PRED_BUFFER *tmp_buffer,
BLOCK_SIZE bsize, int reuse_inter_pred,
PRED_BUFFER **this_mode_pred,
- int *this_early_term, int use_model_yrd_large,
- int64_t best_sse) {
+ int *this_early_term, unsigned int *var,
+ int use_model_yrd_large, int64_t best_sse,
+ int comp_pred) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
@@ -1627,6 +2029,14 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
int best_early_term = 0;
int64_t best_cost = INT64_MAX;
int best_filter_index = -1;
+
+ SubpelParams subpel_params;
+ // Initialize inter prediction params at mode level for single reference
+ // mode.
+ if (!comp_pred)
+ init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
for (int i = 0; i < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE; ++i) {
int64_t cost;
if (cpi->sf.interp_sf.disable_dual_filter &&
@@ -1634,18 +2044,24 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
continue;
mi->interp_filters.as_filters.x_filter = filters_ref_set[i].filter_x;
mi->interp_filters.as_filters.y_filter = filters_ref_set[i].filter_y;
- av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ if (!comp_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+ unsigned int curr_var = UINT_MAX;
if (use_model_yrd_large)
model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
&pf_rd_stats[i], this_early_term, 1, best_sse,
- NULL, UINT_MAX);
+ &curr_var, UINT_MAX);
else
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], &curr_var, 1, NULL);
pf_rd_stats[i].rate += av1_get_switchable_rate(
x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
pf_tx_size[i] = mi->tx_size;
if (cost < best_cost) {
+ *var = curr_var;
best_filter_index = i;
best_cost = cost;
best_skip = pf_rd_stats[i].skip_txfm;
@@ -1655,7 +2071,7 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
free_pred_buffer(*this_mode_pred);
*this_mode_pred = current_pred;
}
- current_pred = &tmp[get_pred_buffer(tmp, 3)];
+ current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
pd->dst.buf = current_pred->data;
pd->dst.stride = bw;
}
@@ -1680,7 +2096,11 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
pd->dst.buf = (*this_mode_pred)->data;
pd->dst.stride = (*this_mode_pred)->stride;
} else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) {
- av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ if (!comp_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
}
}
#if !CONFIG_REALTIME_ONLY
@@ -1786,7 +2206,7 @@ static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
&pf_rd_stats[i], this_early_term, 1, best_sse,
NULL, UINT_MAX);
else
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1, NULL);
pf_rd_stats[i].rate +=
av1_get_switchable_rate(x, xd, cm->features.interp_filter,
cm->seq_params->enable_dual_filter);
@@ -1849,7 +2269,7 @@ static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
&pf_rd_stats[i], this_early_term, 1,
best_sse, NULL, UINT_MAX);
else
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1, NULL);
pf_rd_stats[i].rate +=
mode_costs->motion_mode_cost[bsize][mi->motion_mode];
@@ -1881,11 +2301,13 @@ static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
#endif // !CONFIG_REALTIME_ONLY
#define COLLECT_PICK_MODE_STAT 0
+#define COLLECT_NON_SQR_STAT 0
#if COLLECT_PICK_MODE_STAT
+#include "aom_ports/aom_timer.h"
typedef struct _mode_search_stat {
int32_t num_blocks[BLOCK_SIZES];
- int64_t avg_block_times[BLOCK_SIZES];
+ int64_t total_block_times[BLOCK_SIZES];
int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT];
int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT];
int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT];
@@ -1896,8 +2318,77 @@ typedef struct _mode_search_stat {
int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT];
struct aom_usec_timer timer1;
struct aom_usec_timer timer2;
- struct aom_usec_timer timer3;
+ struct aom_usec_timer bsize_timer;
} mode_search_stat;
+
+static mode_search_stat ms_stat;
+
+static AOM_INLINE void print_stage_time(const char *stage_name,
+ int64_t stage_time,
+ int64_t total_time) {
+ printf(" %s: %ld (%f%%)\n", stage_name, stage_time,
+ 100 * stage_time / (float)total_time);
+}
+
+static void print_time(const mode_search_stat *const ms_stat,
+ const BLOCK_SIZE bsize, const int mi_rows,
+ const int mi_cols, const int mi_row, const int mi_col) {
+ if ((mi_row + mi_size_high[bsize] >= mi_rows) &&
+ (mi_col + mi_size_wide[bsize] >= mi_cols)) {
+ int64_t total_time = 0l;
+ int32_t total_blocks = 0;
+ for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+ total_time += ms_stat->total_block_times[bs];
+ total_blocks += ms_stat->num_blocks[bs];
+ }
+
+ printf("\n");
+ for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+ if (ms_stat->num_blocks[bs] == 0) {
+ continue;
+ }
+ if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) {
+ continue;
+ }
+
+ printf("BLOCK_%dX%d Num %d, Time: %ld (%f%%), Avg_time %f:\n",
+ block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs],
+ ms_stat->total_block_times[bs],
+ 100 * ms_stat->total_block_times[bs] / (float)total_time,
+ (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]);
+ for (int j = 0; j < MB_MODE_COUNT; j++) {
+ if (ms_stat->nonskipped_search_times[bs][j] == 0) {
+ continue;
+ }
+
+ int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j];
+ printf(" Mode %d, %d/%d tps %f\n", j,
+ ms_stat->num_nonskipped_searches[bs][j],
+ ms_stat->num_searches[bs][j],
+ ms_stat->num_nonskipped_searches[bs][j] > 0
+ ? (float)ms_stat->nonskipped_search_times[bs][j] /
+ ms_stat->num_nonskipped_searches[bs][j]
+ : 0l);
+ if (j >= INTER_MODE_START) {
+ total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] +
+ ms_stat->model_rd_time[bs][j] +
+ ms_stat->txfm_time[bs][j];
+ print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j],
+ total_time);
+ print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j],
+ total_time);
+ print_stage_time("Model RD Time", ms_stat->model_rd_time[bs][j],
+ total_time);
+ print_stage_time("Tranfm Search Time", ms_stat->txfm_time[bs][j],
+ total_time);
+ }
+ print_stage_time("Total Mode Time", total_mode_time, total_time);
+ }
+ printf("\n");
+ }
+ printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
+ }
+}
#endif // COLLECT_PICK_MODE_STAT
static void compute_intra_yprediction(const AV1_COMMON *cm,
@@ -1972,6 +2463,17 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
// mode tests.
for (int i = 0; i < 4; ++i) {
PREDICTION_MODE this_mode = intra_mode_list[i];
+
+ // As per the statistics generated for intra mode evaluation in the nonrd
+ // path, the probability of H_PRED being the winner is very low when the
+ // best mode so far is V_PRED (out of DC_PRED and V_PRED). A V_PRED win over
+ // DC_PRED could imply the presence of a vertically dominant pattern, so
+ // H_PRED is not evaluated in that case.
+ if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far &&
+ this_mode == H_PRED && best_mode == V_PRED)
+ continue;
+
this_rdc.dist = this_rdc.rate = 0;
args.mode = this_mode;
args.skippable = 1;
@@ -2029,22 +2531,28 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
const struct segmentation *const seg = &cm->seg;
const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
- // For SVC the usage of alt_ref is determined by the ref_frame_flags.
- int use_alt_ref_frame =
- cpi->ppi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame;
+ // When the ref_frame_config is used to set the reference frame structure
+ // then the usage of alt_ref is determined by the ref_frame_flags
+ // (and not the speed feature use_nonrd_altref_frame).
+ int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config ||
+ cpi->sf.rt_sf.use_nonrd_altref_frame;
+
int use_golden_ref_frame = 1;
int use_last_ref_frame = 1;
- if (cpi->ppi->use_svc)
+ // When the ref_frame_config is used to set the reference frame structure,
+ // check if LAST is used as a reference, and only remove the golden and
+ // altref references below if LAST is used as a reference.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config)
use_last_ref_frame =
cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0;
- // Only remove golden and altref reference below if last is a reference,
- // which may not be the case for svc.
- if (use_last_ref_frame && cpi->rc.frames_since_golden == 0 &&
- gf_temporal_ref) {
+ // frame_since_golden is not used when user sets the referene structure.
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame &&
+ cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
use_golden_ref_frame = 0;
}
+
if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var &&
x->nonrd_prune_ref_frame_search) {
if (is_small_sb)
@@ -2061,18 +2569,10 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
}
if (use_last_ref_frame &&
- (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip ||
+ (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk ||
(x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
use_golden_ref_frame = 0;
use_alt_ref_frame = 0;
- // Keep golden (longer-term) reference if sb has high source sad, for
- // frames whose average souce_sad is below threshold. This is to try to
- // capture case where only part of frame has high motion.
- // Exclude screen content mode.
- if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
- x->content_state_sb.source_sad_nonrd >= kHighSad &&
- bsize <= BLOCK_32X32 && cpi->rc.frame_source_sad < 50000)
- use_golden_ref_frame = 1;
}
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
@@ -2082,16 +2582,50 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
}
// Skip golden reference if color is set, on flat blocks with motion.
- if (x->source_variance < 500 &&
- x->content_state_sb.source_sad_nonrd > kLowSad &&
+ // For screen: always skip golden (if color_sensitivity_sb_g is set)
+ // except when x->nonrd_prune_ref_frame_search = 0. This latter flag
+ // may be set in the variance partition when golden is a much better
+ // reference than last, in which case it may not be worth skipping
+ // golden completely.
+ if (((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->nonrd_prune_ref_frame_search != 0) ||
+ (x->source_variance < 500 &&
+ x->content_state_sb.source_sad_nonrd > kLowSad)) &&
(x->color_sensitivity_sb_g[0] == 1 || x->color_sensitivity_sb_g[1] == 1))
use_golden_ref_frame = 0;
+ // For non-screen: if golden and altref are not being selected as references
+ // (use_golden_ref_frame/use_alt_ref_frame = 0) check to allow golden back
+ // based on the sad of nearest/nearmv of LAST ref. If this block sad is large,
+ // keep golden as reference. Only do this for the agrressive pruning mode and
+ // avoid it when color is set for golden reference.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame &&
+ !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX &&
+ x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity_sb_g[0] == 0 && x->color_sensitivity_sb_g[1] == 0) {
+ int thr = (cm->width * cm->height >= 640 * 360) ? 100 : 150;
+ int pred = x->pred_mv_sad[LAST_FRAME] >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (pred > thr) use_golden_ref_frame = 1;
+ }
+
use_alt_ref_frame =
cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
use_golden_ref_frame =
cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+ // For spatial layers: enable golden ref if it is set by user and
+ // corresponds to the lower spatial layer.
+ if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+ x->content_state_sb.source_sad_nonrd < kHighSad) {
+ const int buffslot_golden =
+ cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME];
+ if (cpi->svc.buffer_time_index[buffslot_golden] ==
+ cpi->svc.current_superframe)
+ use_golden_ref_frame = 1;
+ }
+
use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
use_ref_frame[LAST_FRAME] = use_last_ref_frame;
@@ -2102,6 +2636,36 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
}
+// Checks whether the intra mode needs to be pruned based on the
+// 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_src_sad'
+// speed features.
+static INLINE bool is_prune_intra_mode(AV1_COMP *cpi, int mode_index,
+ int force_intra_check, BLOCK_SIZE bsize,
+ uint8_t segment_id,
+ SOURCE_SAD source_sad_nonrd,
+ uint8_t color_sensitivity[2]) {
+ const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+ if (mode_index > 2 || force_intra_check == 0) {
+ if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
+ return true;
+
+ if (this_mode == DC_PRED) return false;
+
+ if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false;
+
+ const bool has_color_sensitivity =
+ color_sensitivity[0] && color_sensitivity[1];
+ if (has_color_sensitivity &&
+ (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
+ cyclic_refresh_segment_id_boosted(segment_id) ||
+ source_sad_nonrd > kMedSad))
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
/*!\brief Estimates best intra mode for inter mode search
*
* \ingroup nonrd_mode_search
@@ -2116,8 +2680,6 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
* \param[in] x Pointer to structure holding all the
* data for the current macroblock
* \param[in] bsize Current block size
- * \param[in] use_modeled_non_rd_cost Flag, indicating usage of curvfit
- * model for RD cost
* \param[in] best_early_term Flag, indicating that TX for the
* best inter mode was skipped
* \param[in] ref_cost_intra Cost of signalling intra mode
@@ -2134,14 +2696,13 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
* \param[in] ctx Pointer to structure holding coding
* contexts and modes for the block
*
- * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
* \c best_rdc and best selected mode is placed to \c best_pickmode
*/
static void estimate_intra_mode(
- AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int use_modeled_non_rd_cost,
- int best_early_term, unsigned int ref_cost_intra, int reuse_prediction,
- struct buf_2d *orig_dst, PRED_BUFFER *tmp_buffers,
- PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+ AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int best_early_term,
+ unsigned int ref_cost_intra, int reuse_prediction, struct buf_2d *orig_dst,
+ PRED_BUFFER *tmp_buffers, PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
BEST_PICKMODE *best_pickmode, PICK_MODE_CONTEXT *ctx) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -2150,8 +2711,8 @@ static void estimate_intra_mode(
const unsigned char segment_id = mi->segment_id;
const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
- const int mi_row = xd->mi_row;
- const int mi_col = xd->mi_col;
+ const bool is_screen_content =
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
struct macroblockd_plane *const pd = &xd->plane[0];
const CommonQuantParams *quant_params = &cm->quant_params;
@@ -2161,10 +2722,11 @@ static void estimate_intra_mode(
int intra_cost_penalty = av1_get_intra_cost_penalty(
quant_params->base_qindex, quant_params->y_dc_delta_q,
cm->seq_params->bit_depth);
- int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+ int64_t inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
int force_intra_check = 0;
- // For spatial enhancemanent layer: turn off intra prediction if the
+ // For spatial enhancement layer: turn off intra prediction if the
// previous spatial layer as golden ref is not chosen as best reference.
// only do this for temporal enhancement layer and on non-key frames.
if (cpi->svc.spatial_layer_id > 0 &&
@@ -2195,13 +2757,13 @@ static void estimate_intra_mode(
abs(mi->mv[0].as_mv.row) >= motion_thresh ||
abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
intra_cost_penalty = intra_cost_penalty >> 2;
- inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+ inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
do_early_exit_rdthresh = 0;
}
if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
x->content_state_sb.source_sad_nonrd >= kHighSad) ||
- (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- x->source_variance == 0 &&
+ (is_screen_content && x->source_variance < 50 &&
((bsize >= BLOCK_32X32 &&
x->content_state_sb.source_sad_nonrd != kZeroSad) ||
x->color_sensitivity[0] == 1 || x->color_sensitivity[1] == 1)))
@@ -2210,7 +2772,7 @@ static void estimate_intra_mode(
// even if best_early_term is set.
if (bsize >= BLOCK_32X32) best_early_term = 0;
} else if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
- x->content_state_sb.source_sad_nonrd == kLowSad) {
+ x->content_state_sb.source_sad_nonrd <= kLowSad) {
perform_intra_pred = 0;
}
@@ -2223,19 +2785,25 @@ static void estimate_intra_mode(
if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
(perform_intra_pred && !best_early_term &&
- best_rdc->rdcost > inter_mode_thresh &&
bsize <= cpi->sf.part_sf.max_intra_bsize))) {
return;
}
+ // Early exit based on the RD cost calculated using the known rate. When
+ // is_screen_content is true, more bias is given to intra modes; hence, a
+ // more conservative threshold is used for the early exit in that case.
+ const int64_t known_rd = is_screen_content
+ ? CALC_BIASED_RDCOST(inter_mode_thresh)
+ : inter_mode_thresh;
+ if (known_rd > best_rdc->rdcost) return;
+
struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
TX_SIZE intra_tx_size = AOMMIN(
AOMMIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
TX_16X16);
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- cpi->rc.high_source_sad && x->source_variance > spatial_var_thresh &&
- bsize <= BLOCK_16X16)
+ if (is_screen_content && cpi->rc.high_source_sad &&
+ x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16)
intra_tx_size = TX_4X4;
PRED_BUFFER *const best_pred = best_pickmode->best_pred;
@@ -2257,22 +2825,19 @@ static void estimate_intra_mode(
const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
const int64_t mode_rd_thresh = rd_threshes[mode_index];
- if (i > 2 || !(force_intra_check == 1 &&
- best_pickmode->best_ref_frame != INTRA_FRAME)) {
- if (!((1 << this_mode) &
- cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
- continue;
- }
+ if (is_prune_intra_mode(cpi, i, force_intra_check, bsize, segment_id,
+ x->content_state_sb.source_sad_nonrd,
+ x->color_sensitivity))
+ continue;
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+ if (is_screen_content && cpi->sf.rt_sf.source_metrics_sb_nonrd) {
// For spatially flat blocks with zero motion only check
// DC mode.
if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
x->source_variance == 0 && this_mode != DC_PRED)
continue;
- // Only test Intra for big blocks if spatial_variance is 0.
- else if (bsize > BLOCK_32X32 && x->source_variance > 0)
+ // Only test Intra for big blocks if spatial_variance is small.
+ else if (bsize > BLOCK_32X32 && x->source_variance > 50)
continue;
}
@@ -2295,11 +2860,7 @@ static void estimate_intra_mode(
mi->tx_size = intra_tx_size;
compute_intra_yprediction(cm, this_mode, bsize, x, xd);
// Look into selecting tx_size here, based on prediction residual.
- if (use_modeled_non_rd_cost)
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
- else
- av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, bsize,
- mi->tx_size, DCT_DCT, 0);
+ block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size, 0);
// TODO(kyslov@) Need to account for skippable
if (x->color_sensitivity[0]) {
av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
@@ -2325,13 +2886,12 @@ static void estimate_intra_mode(
this_rdc.rate += mode_cost;
this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+ if (is_screen_content && cpi->sf.rt_sf.source_metrics_sb_nonrd) {
// For blocks with low spatial variance and color sad,
// favor the intra-modes, only on scene/slide change.
if (cpi->rc.high_source_sad && x->source_variance < 800 &&
(x->color_sensitivity[0] || x->color_sensitivity[1]))
- this_rdc.rdcost = (7 * this_rdc.rdcost) >> 3;
+ this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost);
// Otherwise bias against intra for blocks with zero
// motion and no color, on non-scene/slide changes.
else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
@@ -2359,25 +2919,44 @@ static void estimate_intra_mode(
mi->tx_size = best_pickmode->best_tx_size;
}
-static AOM_INLINE int is_filter_search_enabled(const AV1_COMP *cpi, int mi_row,
- int mi_col, BLOCK_SIZE bsize,
- int segment_id) {
+static AOM_INLINE int is_filter_search_enabled_blk(
+ AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) {
const AV1_COMMON *const cm = &cpi->common;
- int enable_filter_search = 0;
-
- if (cpi->sf.rt_sf.use_nonrd_filter_search) {
- enable_filter_search = 1;
- if (cpi->sf.interp_sf.cb_pred_filter_search) {
- const int bsl = mi_size_wide_log2[bsize];
- enable_filter_search =
- (((mi_row + mi_col) >> bsl) +
- get_chessboard_index(cm->current_frame.frame_number)) &
- 0x1;
- if (cyclic_refresh_segment_id_boosted(segment_id))
- enable_filter_search = 1;
- }
+ // filter search disabled
+ if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0;
+ // filter search gated purely by mode properties
+ if (!cb_pred_filter_search) return 1;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int enable_interp_search = 0;
+ if (!(xd->left_mbmi && xd->above_mbmi)) {
+ // neighbors info unavailable
+ enable_interp_search = 2;
+ } else if (!(is_inter_block(xd->left_mbmi) &&
+ is_inter_block(xd->above_mbmi))) {
+ // neighbor is INTRA
+ enable_interp_search = 2;
+ } else if (xd->left_mbmi->interp_filters.as_int !=
+ xd->above_mbmi->interp_filters.as_int) {
+ // filters are different
+ enable_interp_search = 2;
+ } else if ((cb_pred_filter_search == 1) &&
+ (xd->left_mbmi->interp_filters.as_filters.x_filter !=
+ EIGHTTAP_REGULAR)) {
+ // not regular
+ enable_interp_search = 2;
+ } else {
+ // enable prediction based on chessboard pattern
+ if (xd->left_mbmi->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH)
+ *filt_select = EIGHTTAP_SMOOTH;
+ const int bsl = mi_size_wide_log2[bsize];
+ enable_interp_search =
+ (bool)((((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1);
+ if (cyclic_refresh_segment_id_boosted(segment_id)) enable_interp_search = 1;
}
- return enable_filter_search;
+ return enable_interp_search;
}
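A minimal sketch of the chessboard gating used in is_filter_search_enabled_blk(): roughly half of the blocks, alternating with the frame number, run the full interpolation filter search while the rest reuse a predicted filter. Names are illustrative, and taking the frame parity as the chessboard index is an assumption of this sketch.

    // Returns 1 for blocks that should run the full filter search this frame.
    static int chessboard_enable(int mi_row, int mi_col, int bsize_log2_mi,
                                 unsigned int frame_number) {
      const unsigned int frame_parity = frame_number & 1;  // flips every frame
      return (int)((((unsigned int)(mi_row + mi_col) >> bsize_log2_mi) +
                    frame_parity) & 1);
    }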
static AOM_INLINE int skip_mode_by_threshold(
@@ -2447,9 +3026,10 @@ static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
return 0;
}
-void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- int y_sad, unsigned int source_variance,
- struct buf_2d yv12_mb[MAX_MB_PLANE]) {
+static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int y_sad,
+ unsigned int source_variance,
+ struct buf_2d yv12_mb[MAX_MB_PLANE]) {
const int subsampling_x = cpi->common.seq_params->subsampling_x;
const int subsampling_y = cpi->common.seq_params->subsampling_y;
int factor = (bsize >= BLOCK_32X32) ? 2 : 3;
@@ -2470,12 +3050,13 @@ void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
if (cpi->noise_estimate.enabled)
noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
if (noise_level == kLow && source_variance > thresh_spatial &&
- norm_sad < 50) {
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) {
x->color_sensitivity[0] = 0;
x->color_sensitivity[1] = 0;
return;
}
- for (int i = 1; i <= 2; ++i) {
+ const int num_planes = av1_num_planes(&cpi->common);
+ for (int i = 1; i < num_planes; ++i) {
if (x->color_sensitivity[i - 1] == 2 || source_variance < 50) {
struct macroblock_plane *const p = &x->plane[i];
const BLOCK_SIZE bs =
@@ -2494,10 +3075,11 @@ void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
}
}
-void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x,
- struct buf_2d yv12_mb[8][MAX_MB_PLANE],
- const int *use_ref_frame_mask,
- const MV_REFERENCE_FRAME *rf, int *ref_mv_idx) {
+static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x,
+ struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+ const int *use_ref_frame_mask,
+ const MV_REFERENCE_FRAME *rf,
+ int *ref_mv_idx) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
@@ -2550,73 +3132,37 @@ static void set_compound_mode(MACROBLOCK *x, int ref_frame, int ref_frame2,
}
}
-static int skip_comp_based_on_sad(AV1_COMP *cpi, MACROBLOCK *x,
- const int mi_row, const int mi_col,
- BLOCK_SIZE bsize) {
- AV1_COMMON *const cm = &cpi->common;
- assert(!(mi_row % 16) && !(mi_col % 16));
- const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
- ? (cm->seq_params->mib_size >> 1)
- : cm->seq_params->mib_size;
- const int sb_cols =
- (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
- const uint64_t sad_skp_comp_th[2][3] = { { 2700, 3100 }, // CPU 9
- { 2700, 3200 } }; // CPU 10
- const uint64_t sad_blkwise_var_th = 5000;
- const float qindex_th_scale[5] = { 0.75f, 0.9f, 1.0f, 1.1f, 1.25f };
- const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
- assert(qindex_band < 5);
- const int sp_idx = (cpi->sf.rt_sf.sad_based_comp_prune >= 2);
- const int bsize_idx = (bsize == BLOCK_128X128);
- const uint64_t sad_skp_comp_th_val = (uint64_t)(
- sad_skp_comp_th[sp_idx][bsize_idx] * qindex_th_scale[qindex_band]);
- uint64_t blk_sad = 0, sad00, sad01, sad10, sad11, min_sad, max_sad;
- const int sbi_col = mi_col / 16;
- const int sbi_row = mi_row / 16;
- const uint64_t *cur_blk_sad =
- &cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
-
- if (bsize == BLOCK_128X128) {
- sad00 = cur_blk_sad[0];
- sad01 = cur_blk_sad[1];
- sad10 = cur_blk_sad[sb_cols];
- sad11 = cur_blk_sad[1 + sb_cols];
- min_sad = AOMMIN(AOMMIN(AOMMIN(sad00, sad01), sad10), sad11);
- max_sad = AOMMAX(AOMMAX(AOMMAX(sad00, sad01), sad10), sad11);
- if (max_sad - min_sad > sad_blkwise_var_th) return 0;
- blk_sad = (sad00 + sad01 + sad10 + sad11 + 2) >> 2;
- } else if (bsize == BLOCK_128X64) {
- sad00 = cur_blk_sad[0];
- sad01 = cur_blk_sad[1];
- min_sad = AOMMIN(sad00, sad01);
- max_sad = AOMMAX(sad00, sad01);
- if (max_sad - min_sad > sad_blkwise_var_th) return 0;
- blk_sad = (sad00 + sad01 + 1) >> 1;
- } else if (bsize == BLOCK_64X128) {
- sad00 = cur_blk_sad[0];
- sad10 = cur_blk_sad[sb_cols];
- min_sad = AOMMIN(sad00, sad10);
- max_sad = AOMMAX(sad00, sad10);
- if (max_sad - min_sad > sad_blkwise_var_th) return 0;
- blk_sad = (sad00 + sad10 + 1) >> 1;
- } else if (bsize <= BLOCK_64X64) {
- blk_sad = cur_blk_sad[0];
- } else {
- assert(0);
+// Prune compound mode if the single mode variance is lower than a fixed
+// percentage of the median value.
+static bool skip_comp_based_on_var(
+ const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) {
+ unsigned int best_var = UINT_MAX;
+ for (int cur_mode_idx = 0; cur_mode_idx < RTC_INTER_MODES; cur_mode_idx++) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ best_var = AOMMIN(best_var, single_vars[cur_mode_idx][ref_idx]);
+ }
+ }
+ const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659);
+ const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281);
+
+ // Currently, the thresholds for 128 and 16 are not well tuned. We are using
+ // the results from 64 and 32 as a heuristic.
+ switch (bsize) {
+ case BLOCK_128X128: return best_var < 4 * thresh_64;
+ case BLOCK_64X64: return best_var < thresh_64;
+ case BLOCK_32X32: return best_var < thresh_32;
+ case BLOCK_16X16: return best_var < thresh_32 / 4;
+ default: return false;
}
-
- if (blk_sad < sad_skp_comp_th_val) return 1;
-
- return 0;
}
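For reference, the constants above evaluate to roughly 4966 (64x64) and 1025 (32x32). A toy driver, with illustrative names and a shrunken statistics grid, showing how the best single-mode variance is collected and compared against the 64x64 cutoff:

    #include <limits.h>
    #include <stdio.h>

    // Toy usage: the 2x4 grid stands in for RTC_INTER_MODES x REF_FRAMES.
    int main(void) {
      const unsigned int vars[2][4] = { { 9000, 4100, UINT_MAX, UINT_MAX },
                                        { 7800, 5200, UINT_MAX, UINT_MAX } };
      unsigned int best_var = UINT_MAX;
      for (int m = 0; m < 2; ++m)
        for (int r = 0; r < 4; ++r)
          if (vars[m][r] < best_var) best_var = vars[m][r];
      const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659);
      printf("best_var=%u -> %s compound search\n", best_var,
             best_var < thresh_64 ? "skip" : "run");
      return 0;
    }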
static AOM_FORCE_INLINE void fill_single_inter_mode_costs(
int (*single_inter_mode_costs)[REF_FRAMES], const int num_inter_modes,
- const REF_MODE *ref_mode_set, const ModeCosts *mode_costs,
+ const REF_MODE *reference_mode_set, const ModeCosts *mode_costs,
const int16_t *mode_context) {
bool ref_frame_used[REF_FRAMES] = { false };
for (int idx = 0; idx < num_inter_modes; idx++) {
- ref_frame_used[ref_mode_set[idx].ref_frame] = true;
+ ref_frame_used[reference_mode_set[idx].ref_frame] = true;
}
for (int this_ref_frame = LAST_FRAME; this_ref_frame < REF_FRAMES;
@@ -2700,6 +3246,303 @@ static AOM_INLINE int setup_compound_params_from_comp_idx(
return 1;
}
+static AOM_INLINE bool previous_mode_performed_poorly(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
+ const unsigned int (*vars)[REF_FRAMES],
+ const int64_t (*uv_dist)[REF_FRAMES]) {
+ unsigned int best_var = UINT_MAX;
+ int64_t best_uv_dist = INT64_MAX;
+ for (int midx = 0; midx < RTC_INTER_MODES; midx++) {
+ best_var = AOMMIN(best_var, vars[midx][ref_frame]);
+ best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]);
+ }
+ assert(best_var != UINT_MAX && "Invalid variance data.");
+ const float mult = 1.125f;
+ bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame];
+ if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX &&
+ best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) {
+ // If we have chroma info, then take it into account
+ var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame];
+ }
+ return var_bad;
+}
+
+static AOM_INLINE bool prune_compoundmode_with_singlemode_var(
+ PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES],
+ const uint8_t (*mode_checked)[REF_FRAMES],
+ const unsigned int (*vars)[REF_FRAMES],
+ const int64_t (*uv_dist)[REF_FRAMES]) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode);
+
+ bool first_ref_valid = false, second_ref_valid = false;
+ bool first_ref_bad = false, second_ref_bad = false;
+ if (mode_checked[single_mode0][ref_frame] &&
+ frame_mv[single_mode0][ref_frame].as_int ==
+ frame_mv[compound_mode][ref_frame].as_int &&
+ vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) {
+ first_ref_valid = true;
+ first_ref_bad =
+ previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist);
+ }
+ if (mode_checked[single_mode1][ref_frame2] &&
+ frame_mv[single_mode1][ref_frame2].as_int ==
+ frame_mv[compound_mode][ref_frame2].as_int &&
+ vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) {
+ second_ref_valid = true;
+ second_ref_bad =
+ previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist);
+ }
+ if (first_ref_valid && second_ref_valid) {
+ return first_ref_bad && second_ref_bad;
+ } else if (first_ref_valid || second_ref_valid) {
+ return first_ref_bad || second_ref_bad;
+ }
+ return false;
+}
+
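A toy restatement of the decision rule implemented by prune_compoundmode_with_singlemode_var(): a compound mode is pruned when every component single-reference mode with usable statistics performed poorly (the 1.125 multiplier above means roughly 12.5% worse than the best variance seen). The helper below is a sketch with illustrative names, not the encoder's signature.

    #include <stdbool.h>

    static bool prune_compound_from_components(bool ref0_valid, bool ref0_bad,
                                               bool ref1_valid, bool ref1_bad) {
      if (ref0_valid && ref1_valid) return ref0_bad && ref1_bad;
      if (ref0_valid || ref1_valid) return ref0_bad || ref1_bad;
      return false;  // no usable single-mode statistics: do not prune
    }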
+// Function to setup parameters used for inter mode evaluation.
+static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ TileDataEnc *tile_data, PICK_MODE_CONTEXT *ctx, RD_STATS *rd_cost,
+ int *force_skip_low_temp_var, int *skip_pred_mv, const int mi_row,
+ const int mi_col, const int gf_temporal_ref, const unsigned char segment_id,
+ BLOCK_SIZE bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ int denoise_svc_pickmode
+#endif
+) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ (void)ctx;
+
+ for (int idx = 0; idx < RTC_INTER_MODES; idx++) {
+ for (int ref = 0; ref < REF_FRAMES; ref++) {
+ search_state->vars[idx][ref] = UINT_MAX;
+ search_state->uv_dist[idx][ref] = INT64_MAX;
+ }
+ }
+
+ x->color_sensitivity[0] = x->color_sensitivity_sb[0];
+ x->color_sensitivity[1] = x->color_sensitivity_sb[1];
+ init_best_pickmode(&search_state->best_pickmode);
+
+ estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize,
+ search_state->ref_costs_single);
+
+ memset(&search_state->mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
+
+ txfm_info->skip_txfm = 0;
+
+ // initialize mode decisions
+ av1_invalid_rd_stats(&search_state->best_rdc);
+ av1_invalid_rd_stats(&search_state->this_rdc);
+ av1_invalid_rd_stats(rd_cost);
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ x->warp_sample_info[i].num = -1;
+ }
+
+ mi->bsize = bsize;
+ mi->ref_frame[0] = NONE_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ // if (cpi->ppi->use_svc) denoise_svc_pickmode =
+ // av1_denoise_svc_non_key(cpi);
+ if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+ av1_denoiser_reset_frame_stats(ctx);
+ }
+#endif
+
+ if (cpi->ref_frame_flags & AOM_LAST_FLAG)
+ find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv, tile_data,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ x->force_zeromv_skip_for_blk);
+
+ get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
+ search_state->use_ref_frame_mask,
+ force_skip_low_temp_var);
+
+ *skip_pred_mv =
+ x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 2 && x->color_sensitivity[0] != 2 &&
+ x->color_sensitivity[1] != 2);
+
+ // Start at LAST_FRAME + 1.
+ for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1;
+ ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
+ if (search_state->use_ref_frame_mask[ref_frame_iter]) {
+ find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv, tile_data,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ *skip_pred_mv);
+ }
+ }
+}
+
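The statistics reset at the top of set_params_nonrd_pick_inter_mode() amounts to filling two per-(mode, reference) grids with "not evaluated" sentinels. A self-contained sketch, with array sizes standing in for RTC_INTER_MODES and REF_FRAMES:

    #include <limits.h>
    #include <stdint.h>

    #define DEMO_MODES 4  // stand-in for RTC_INTER_MODES
    #define DEMO_REFS 8   // stand-in for REF_FRAMES

    typedef struct {
      unsigned int vars[DEMO_MODES][DEMO_REFS];
      int64_t uv_dist[DEMO_MODES][DEMO_REFS];
    } ModeStatsDemo;

    static void init_mode_stats(ModeStatsDemo *stats) {
      for (int m = 0; m < DEMO_MODES; ++m) {
        for (int r = 0; r < DEMO_REFS; ++r) {
          stats->vars[m][r] = UINT_MAX;      // variance not measured yet
          stats->uv_dist[m][r] = INT64_MAX;  // chroma distortion not measured
        }
      }
    }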
+// Function to check the inter mode can be skipped based on mode statistics and
+// speed features settings.
+static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *comp_pred,
+ PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame,
+ MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx,
+ int svc_mv_col, int svc_mv_row, int force_skip_low_temp_var,
+ unsigned int sse_zeromv_norm, const int num_inter_modes,
+ const unsigned char segment_id, BLOCK_SIZE bsize,
+ bool comp_use_zero_zeromv_only, bool check_globalmv) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+
+ if (idx >= num_inter_modes) {
+ const int comp_index = idx - num_inter_modes;
+ if (!setup_compound_params_from_comp_idx(
+ cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2,
+ search_state->frame_mv, search_state->use_ref_frame_mask,
+ comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame)) {
+ return true;
+ }
+ *comp_pred = 1;
+ } else {
+ *this_mode = ref_mode_set[idx].pred_mode;
+ *ref_frame = ref_mode_set[idx].ref_frame;
+ *ref_frame2 = NONE_FRAME;
+ }
+
+ if (!*comp_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
+ return true;
+ }
+
+ if (!check_globalmv && *this_mode == GLOBALMV) {
+ return true;
+ }
+
+#if COLLECT_PICK_MODE_STAT
+ aom_usec_timer_start(&ms_stat.timer1);
+ ms_stat.num_searches[bsize][*this_mode]++;
+#endif
+ mi->mode = *this_mode;
+ mi->ref_frame[0] = *ref_frame;
+ mi->ref_frame[1] = *ref_frame2;
+
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
+
+ if (x->force_zeromv_skip_for_blk &&
+ ((!(*this_mode == NEARESTMV &&
+ search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
+ *this_mode != GLOBALMV) ||
+ *ref_frame != LAST_FRAME))
+ return true;
+
+ if (cpi->sf.rt_sf.prune_compoundmode_with_singlemode_var && *comp_pred &&
+ prune_compoundmode_with_singlemode_var(
+ *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv,
+ search_state->mode_checked, search_state->vars,
+ search_state->uv_dist)) {
+ return true;
+ }
+
+ *force_mv_inter_layer = 0;
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
+ (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
+ (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
+ // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
+ // Skip newmv and filter search.
+ *force_mv_inter_layer = 1;
+ if (*this_mode == NEWMV) {
+ search_state->frame_mv[*this_mode][*ref_frame].as_mv.col = svc_mv_col;
+ search_state->frame_mv[*this_mode][*ref_frame].as_mv.row = svc_mv_row;
+ } else if (search_state->frame_mv[*this_mode][*ref_frame].as_mv.col !=
+ svc_mv_col ||
+ search_state->frame_mv[*this_mode][*ref_frame].as_mv.row !=
+ svc_mv_row) {
+ return true;
+ }
+ }
+
+ // If the segment reference frame feature is enabled then do nothing if the
+ // current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
+ return true;
+
+ // For screen content: for base spatial layer only for now.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == 0) {
+ // If source_sad is computed: skip non-zero motion
+ // check for stationary (super)blocks. Otherwise if superblock
+ // has motion skip the modes with zero motion for flat blocks,
+ // and color is not set.
+ // For the latter condition: the same condition should apply
+ // to newmv if (0, 0), so this latter condition is repeated
+ // below after search_new_mv.
+ if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+ if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+ (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ ((x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0))
+ return true;
+ }
+ // Skip NEWMV search for flat blocks.
+ if (*this_mode == NEWMV && x->source_variance < 100) return true;
+ // Skip non-LAST for color on flat blocks.
+ if (*ref_frame > LAST_FRAME && x->source_variance == 0 &&
+ (x->color_sensitivity[0] == 1 || x->color_sensitivity[1] == 1))
+ return true;
+ }
+
+ if (skip_mode_by_bsize_and_ref_frame(
+ *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+ sse_zeromv_norm, cpi->sf.rt_sf.nonrd_aggressive_skip))
+ return true;
+
+ if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ force_skip_low_temp_var))
+ return true;
+
+ // Disable this drop out case if the ref frame segment level feature is
+ // enabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Check for skipping GOLDEN and ALTREF based pred_mv_sad.
+ if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 &&
+ x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) {
+ if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true;
+ }
+ }
+
+ // Check for skipping NEARMV based on pred_mv_sad.
+ if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX &&
+ x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1))
+ return true;
+
+ if (!*comp_pred) {
+ if (skip_mode_by_threshold(
+ *this_mode, *ref_frame,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize],
+ x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost,
+ search_state->best_pickmode.best_mode_skip_txfm,
+ (cpi->sf.rt_sf.nonrd_aggressive_skip ? 1 : 0)))
+ return true;
+ }
+ return false;
+}
+
void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCK *x, RD_STATS *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -2711,51 +3554,31 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
const InterpFilter filter_ref = cm->features.interp_filter;
const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
- BEST_PICKMODE best_pickmode;
-#if COLLECT_PICK_MODE_STAT
- static mode_search_stat ms_stat;
-#endif
MV_REFERENCE_FRAME ref_frame, ref_frame2;
- int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
- int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
- uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
- struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
- RD_STATS this_rdc, best_rdc;
const unsigned char segment_id = mi->segment_id;
- const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
- const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
int best_early_term = 0;
- unsigned int ref_costs_single[REF_FRAMES];
int force_skip_low_temp_var = 0;
- int use_ref_frame_mask[REF_FRAMES] = { 0 };
unsigned int sse_zeromv_norm = UINT_MAX;
- // Use mode set that includes zeromv (via globalmv) for speed >= 9 for
- // content with low motion, and always for force_zeromv_skip.
- int use_zeromv =
- cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
- ((cpi->oxcf.speed >= 9 && cpi->rc.avg_frame_low_motion > 70) ||
- cpi->sf.rt_sf.nonrd_agressive_skip || x->force_zeromv_skip);
int skip_pred_mv = 0;
- const int num_inter_modes =
- use_zeromv ? NUM_INTER_MODES_REDUCED : NUM_INTER_MODES_RT;
- const REF_MODE *const ref_mode_set =
- use_zeromv ? ref_mode_set_reduced : ref_mode_set_rt;
- PRED_BUFFER tmp[4];
+ const int num_inter_modes = NUM_INTER_MODES;
+ bool check_globalmv = cpi->sf.rt_sf.check_globalmv_on_single_ref;
+ PRED_BUFFER tmp_buffer[4];
DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
PRED_BUFFER *this_mode_pred = NULL;
const int reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd &&
cm->seq_params->bit_depth == AOM_BITS_8;
+ InterModeSearchStateNonrd search_state;
+ av1_zero(search_state.use_ref_frame_mask);
const int bh = block_size_high[bsize];
const int bw = block_size_wide[bsize];
const int pixels_in_block = bh * bw;
const int num_8x8_blocks = ctx->num_4x4_blk / 4;
struct buf_2d orig_dst = pd->dst;
- const CommonQuantParams *quant_params = &cm->quant_params;
const TxfmSearchParams *txfm_params = &x->txfm_search_params;
TxfmSearchInfo *txfm_info = &x->txfm_search_info;
#if COLLECT_PICK_MODE_STAT
- aom_usec_timer_start(&ms_stat.timer2);
+ aom_usec_timer_start(&ms_stat.bsize_timer);
#endif
int64_t thresh_sad_pred = INT64_MAX;
const int mi_row = xd->mi_row;
@@ -2763,13 +3586,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
int svc_mv_col = 0;
int svc_mv_row = 0;
int force_mv_inter_layer = 0;
- int use_modeled_non_rd_cost = 0;
bool comp_use_zero_zeromv_only = 0;
int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT;
- unsigned int zeromv_var[REF_FRAMES];
- for (int idx = 0; idx < REF_FRAMES; idx++) {
- zeromv_var[idx] = UINT_MAX;
- }
#if CONFIG_AV1_TEMPORAL_DENOISING
const int denoise_recheck_zeromv = 1;
AV1_PICKMODE_CTX_DEN ctx_den;
@@ -2777,57 +3595,26 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
int denoise_svc_pickmode = 1;
const int resize_pending = is_frame_resize_pending(cpi);
#endif
- x->color_sensitivity[0] = x->color_sensitivity_sb[0];
- x->color_sensitivity[1] = x->color_sensitivity_sb[1];
- init_best_pickmode(&best_pickmode);
-
const ModeCosts *mode_costs = &x->mode_costs;
- estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize,
- ref_costs_single);
-
- memset(&mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
if (reuse_inter_pred) {
for (int i = 0; i < 3; i++) {
- tmp[i].data = &pred_buf[pixels_in_block * i];
- tmp[i].stride = bw;
- tmp[i].in_use = 0;
+ tmp_buffer[i].data = &pred_buf[pixels_in_block * i];
+ tmp_buffer[i].stride = bw;
+ tmp_buffer[i].in_use = 0;
}
- tmp[3].data = pd->dst.buf;
- tmp[3].stride = pd->dst.stride;
- tmp[3].in_use = 0;
+ tmp_buffer[3].data = pd->dst.buf;
+ tmp_buffer[3].stride = pd->dst.stride;
+ tmp_buffer[3].in_use = 0;
}
- txfm_info->skip_txfm = 0;
-
- // initialize mode decisions
- av1_invalid_rd_stats(&best_rdc);
- av1_invalid_rd_stats(&this_rdc);
- av1_invalid_rd_stats(rd_cost);
- for (int i = 0; i < REF_FRAMES; ++i) {
- x->warp_sample_info[i].num = -1;
- }
-
- mi->bsize = bsize;
- mi->ref_frame[0] = NONE_FRAME;
- mi->ref_frame[1] = NONE_FRAME;
-
-#if CONFIG_AV1_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- // if (cpi->ppi->use_svc) denoise_svc_pickmode =
- // av1_denoise_svc_non_key(cpi);
- if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
- av1_denoiser_reset_frame_stats(ctx);
- }
-#endif
-
const int gf_temporal_ref = is_same_gf_and_last_scale(cm);
// If the lower spatial layer uses an averaging filter for downsampling
// (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative
// to source, so use subpel motion vector to compensate. The nonzero motion
// is half pixel shifted to left and top, so (-4, -4). This has more effect
- // on higher resolutins, so condition it on that for now.
+ // on higher resolutions, so condition it on that for now.
if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
cm->width * cm->height > 640 * 480) {
@@ -2835,12 +3622,15 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
svc_mv_row = -4;
}
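A small sketch of the compensation described in the comment above: when the lower spatial layer was downsampled with an averaging filter (phase 8), its sample grid sits half a pixel to the right and below the source, so inter-layer prediction uses a (-4, -4) motion vector in 1/8-pel units at larger resolutions. Function and parameter names are illustrative.

    static void svc_phase8_mv_offset(int lower_layer_uses_averaging_filter,
                                     int width, int height, int *mv_col_q3,
                                     int *mv_row_q3) {
      *mv_col_q3 = 0;
      *mv_row_q3 = 0;
      if (lower_layer_uses_averaging_filter && width * height > 640 * 480) {
        *mv_col_q3 = -4;  // half-pel shift to the left, in 1/8-pel units
        *mv_row_q3 = -4;  // half-pel shift upwards
      }
    }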
- get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
- use_ref_frame_mask, &force_skip_low_temp_var);
-
- skip_pred_mv = x->force_zeromv_skip ||
- (x->nonrd_prune_ref_frame_search > 2 &&
- x->color_sensitivity[0] != 2 && x->color_sensitivity[1] != 2);
+ // Setup parameters used for inter mode evaluation.
+ set_params_nonrd_pick_inter_mode(
+ cpi, x, &search_state, tile_data, ctx, rd_cost, &force_skip_low_temp_var,
+ &skip_pred_mv, mi_row, mi_col, gf_temporal_ref, segment_id, bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ denoise_svc_pickmode
+#endif
+ );
if (cpi->sf.rt_sf.use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
// Only search compound if bsize > BLOCK_16X16.
@@ -2854,49 +3644,29 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
tot_num_comp_modes = 0;
}
- // Skip compound mode based on sad
- if (tot_num_comp_modes && cpi->sf.rt_sf.sad_based_comp_prune &&
- bsize >= BLOCK_64X64 && cpi->src_sad_blk_64x64 &&
- skip_comp_based_on_sad(cpi, x, mi_row, mi_col, bsize)) {
- tot_num_comp_modes = 0;
+ if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) {
+ thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+ // Increase threshold for less aggressive pruning.
+ if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search == 1)
+ thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
}
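A sketch of the reference-pruning threshold set above: non-LAST references are skipped later when their pred-MV SAD exceeds roughly twice (or 2.25x under the less aggressive setting) the LAST-frame pred-MV SAD. Names below are illustrative.

    #include <limits.h>
    #include <stdint.h>

    static int64_t ref_prune_sad_threshold(int last_frame_pred_mv_sad,
                                           int less_aggressive) {
      if (last_frame_pred_mv_sad == INT_MAX) return INT64_MAX;  // never prune
      int64_t thresh = ((int64_t)last_frame_pred_mv_sad) << 1;  // 2x LAST SAD
      if (less_aggressive) thresh += last_frame_pred_mv_sad >> 2;  // +0.25x
      return thresh;
    }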
- for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME;
- ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
- if (use_ref_frame_mask[ref_frame_iter]) {
- find_predictors(cpi, x, ref_frame_iter, frame_mv, tile_data, yv12_mb,
- bsize, force_skip_low_temp_var, skip_pred_mv);
- }
- }
-
- thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
- // Increase threshold for less agressive pruning.
- if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search == 1)
- thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
-
- const int large_block = bsize >= BLOCK_32X32;
- const int use_model_yrd_large =
- cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block &&
- !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
- quant_params->base_qindex && cm->seq_params->bit_depth == 8;
-
- const int enable_filter_search =
- is_filter_search_enabled(cpi, mi_row, mi_col, bsize, segment_id);
-
- // TODO(marpan): Look into reducing these conditions. For now constrain
- // it to avoid significant bdrate loss.
- if (cpi->sf.rt_sf.use_modeled_non_rd_cost) {
- if (cpi->svc.non_reference_frame)
- use_modeled_non_rd_cost = 1;
- else if (cpi->svc.number_temporal_layers > 1 &&
- cpi->svc.temporal_layer_id == 0)
- use_modeled_non_rd_cost = 0;
- else
- use_modeled_non_rd_cost =
- (quant_params->base_qindex > 120 && x->source_variance > 100 &&
- bsize <= BLOCK_16X16 && !x->content_state_sb.lighting_change &&
- x->content_state_sb.source_sad_nonrd != kHighSad);
- }
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+ // decide block-level interp filter search flags:
+ // filter_search_enabled_blk:
+ // 0: disabled
+ // 1: filter search depends on mode properties
+ // 2: filter search forced since prediction is unreliable
+ // cb_pred_filter_search == 0: chessboard filter prediction disabled
+ InterpFilter filt_select = EIGHTTAP_REGULAR;
+ const int cb_pred_filter_search =
+ x->content_state_sb.source_sad_nonrd > kVeryLowSad
+ ? cpi->sf.interp_sf.cb_pred_filter_search
+ : 0;
+ const int filter_search_enabled_blk =
+ is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id,
+ cb_pred_filter_search, &filt_select);
#if COLLECT_PICK_MODE_STAT
ms_stat.num_blocks[bsize]++;
@@ -2907,17 +3677,29 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
TX_16X16);
- int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES];
- if (ref_mode_set == ref_mode_set_reduced) {
- fill_single_inter_mode_costs(single_inter_mode_costs, num_inter_modes,
- ref_mode_set, mode_costs,
- mbmi_ext->mode_context);
- }
+ fill_single_inter_mode_costs(search_state.single_inter_mode_costs,
+ num_inter_modes, ref_mode_set, mode_costs,
+ mbmi_ext->mode_context);
MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME;
+ // Initialize inter prediction params at block level for single reference
+ // mode.
+ InterPredParams inter_pred_params_sr;
+ init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height,
+ mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd),
+ /*is_intrabc=*/0);
+ inter_pred_params_sr.conv_params =
+ get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
+
for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
- const struct segmentation *const seg = &cm->seg;
+ // If we are at the first compound mode, and the single modes already
+ // perform well, then end the search.
+ if (cpi->sf.rt_sf.skip_compound_based_on_var && idx == num_inter_modes &&
+ skip_comp_based_on_var(search_state.vars, bsize)) {
+ break;
+ }
int rate_mv = 0;
int is_skippable;
@@ -2931,128 +3713,20 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
memset(txfm_info->blk_skip, 0,
sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
- if (idx >= num_inter_modes) {
- const int comp_index = idx - num_inter_modes;
- if (!setup_compound_params_from_comp_idx(
- cpi, x, yv12_mb, &this_mode, &ref_frame, &ref_frame2, frame_mv,
- use_ref_frame_mask, comp_index, comp_use_zero_zeromv_only,
- &last_comp_ref_frame)) {
- continue;
- }
- comp_pred = 1;
- } else {
- this_mode = ref_mode_set[idx].pred_mode;
- ref_frame = ref_mode_set[idx].ref_frame;
- ref_frame2 = NONE_FRAME;
- }
-
- if (!comp_pred && mode_checked[this_mode][ref_frame]) {
- continue;
- }
-
-#if COLLECT_PICK_MODE_STAT
- aom_usec_timer_start(&ms_stat.timer1);
- ms_stat.num_searches[bsize][this_mode]++;
-#endif
- mi->mode = this_mode;
- mi->ref_frame[0] = ref_frame;
- mi->ref_frame[1] = ref_frame2;
-
- if (!use_ref_frame_mask[ref_frame]) continue;
-
- if (x->force_zeromv_skip &&
- (this_mode != GLOBALMV || ref_frame != LAST_FRAME))
- continue;
-
- force_mv_inter_layer = 0;
- if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
- ((ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
- (ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
- (ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
- // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
- // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
- // Skip newmv and filter search.
- force_mv_inter_layer = 1;
- if (this_mode == NEWMV) {
- frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
- frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
- } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
- frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
- continue;
- }
- }
-
- // If the segment reference frame feature is enabled then do nothing if the
- // current ref frame is not allowed.
- if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
- get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+ // Check the inter mode can be skipped based on mode statistics and speed
+ // features settings.
+ if (skip_inter_mode_nonrd(
+ cpi, x, &search_state, &thresh_sad_pred, &force_mv_inter_layer,
+ &comp_pred, &this_mode, &last_comp_ref_frame, &ref_frame,
+ &ref_frame2, idx, svc_mv_col, svc_mv_row, force_skip_low_temp_var,
+ sse_zeromv_norm, num_inter_modes, segment_id, bsize,
+ comp_use_zero_zeromv_only, check_globalmv))
continue;
- // For screen content:
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
- // If source_sad is computed: skip non-zero motion
- // check for stationary (super)blocks. Otherwise if superblock
- // has motion skip the modes with zero motion for flat blocks,
- // and color is not set.
- // For the latter condition: the same condition should apply
- // to newmv if (0, 0), so this latter condition is repeated
- // below after search_new_mv.
- if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
- if ((frame_mv[this_mode][ref_frame].as_int != 0 &&
- x->content_state_sb.source_sad_nonrd == kZeroSad) ||
- (frame_mv[this_mode][ref_frame].as_int == 0 &&
- x->content_state_sb.source_sad_nonrd != kZeroSad &&
- ((x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0) ||
- cpi->rc.high_source_sad) &&
- x->source_variance == 0))
- continue;
- }
- // Skip NEWMV search for flat blocks.
- if (this_mode == NEWMV && x->source_variance < 100) continue;
- // Skip non-LAST for color on flat blocks.
- if (ref_frame > LAST_FRAME && x->source_variance == 0 &&
- (x->color_sensitivity[0] == 1 || x->color_sensitivity[1] == 1))
- continue;
- }
-
- if (skip_mode_by_bsize_and_ref_frame(
- this_mode, ref_frame, bsize, x->nonrd_prune_ref_frame_search,
- sse_zeromv_norm, cpi->sf.rt_sf.nonrd_agressive_skip))
- continue;
-
- if (skip_mode_by_low_temp(this_mode, ref_frame, bsize, x->content_state_sb,
- frame_mv[this_mode][ref_frame],
- force_skip_low_temp_var))
- continue;
-
- // Disable this drop out case if the ref frame segment level feature is
- // enabled for this segment. This is to prevent the possibility that we
- // end up unable to pick any mode.
- if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
- // Check for skipping GOLDEN and ALTREF based pred_mv_sad.
- if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 &&
- x->pred_mv_sad[ref_frame] != INT_MAX && ref_frame != LAST_FRAME) {
- if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) continue;
- }
- }
- // Check for skipping NEARMV based on pred_mv_sad.
- if (this_mode == NEARMV && x->pred_mv1_sad[ref_frame] != INT_MAX &&
- x->pred_mv1_sad[ref_frame] > (x->pred_mv0_sad[ref_frame] << 1))
- continue;
-
- if (!comp_pred) {
- if (skip_mode_by_threshold(
- this_mode, ref_frame, frame_mv[this_mode][ref_frame],
- cpi->rc.frames_since_golden, rd_threshes, rd_thresh_freq_fact,
- best_rdc.rdcost, best_pickmode.best_mode_skip_txfm,
- (cpi->sf.rt_sf.nonrd_agressive_skip ? 1 : 0)))
- continue;
- }
-
// Select prediction reference frames.
for (int i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
- if (comp_pred) xd->plane[i].pre[1] = yv12_mb[ref_frame2][i];
+ xd->plane[i].pre[0] = search_state.yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = search_state.yv12_mb[ref_frame2][i];
}
mi->ref_frame[0] = ref_frame;
@@ -3063,9 +3737,9 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_start(&ms_stat.timer2);
#endif
- const bool skip_newmv =
- search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
- mi_row, mi_col, &rate_mv, &best_rdc);
+ const bool skip_newmv = search_new_mv(
+ cpi, x, search_state.frame_mv, ref_frame, gf_temporal_ref, bsize,
+ mi_row, mi_col, &rate_mv, &search_state.best_rdc);
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_mark(&ms_stat.timer2);
ms_stat.ms_time[bsize][this_mode] +=
@@ -3079,9 +3753,9 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
inter_mv_mode++) {
if (inter_mv_mode == this_mode) continue;
- if (!comp_pred && mode_checked[inter_mv_mode][ref_frame] &&
- frame_mv[this_mode][ref_frame].as_int ==
- frame_mv[inter_mv_mode][ref_frame].as_int) {
+ if (!comp_pred && search_state.mode_checked[inter_mv_mode][ref_frame] &&
+ search_state.frame_mv[this_mode][ref_frame].as_int ==
+ search_state.frame_mv[inter_mv_mode][ref_frame].as_int) {
skip_this_mv = 1;
break;
}
@@ -3093,8 +3767,9 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
// skip newmv if the motion vector is (0, 0), and color is not set.
if (this_mode == NEWMV &&
cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == 0 &&
cpi->sf.rt_sf.source_metrics_sb_nonrd) {
- if (frame_mv[this_mode][ref_frame].as_int == 0 &&
+ if (search_state.frame_mv[this_mode][ref_frame].as_int == 0 &&
x->content_state_sb.source_sad_nonrd != kZeroSad &&
((x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0) ||
cpi->rc.high_source_sad) &&
@@ -3103,37 +3778,43 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
}
mi->mode = this_mode;
- mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+ mi->mv[0].as_int = search_state.frame_mv[this_mode][ref_frame].as_int;
mi->mv[1].as_int = 0;
- if (comp_pred) mi->mv[1].as_int = frame_mv[this_mode][ref_frame2].as_int;
+ if (comp_pred)
+ mi->mv[1].as_int = search_state.frame_mv[this_mode][ref_frame2].as_int;
if (reuse_inter_pred) {
if (!this_mode_pred) {
- this_mode_pred = &tmp[3];
+ this_mode_pred = &tmp_buffer[3];
} else {
- this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
pd->dst.buf = this_mode_pred->data;
pd->dst.stride = bw;
}
}
-#if COLLECT_PICK_MODE_STAT
- ms_stat.num_nonskipped_searches[bsize][this_mode]++;
-#endif
if (idx == 0 && !skip_pred_mv) {
// Set color sensitivity on first tested mode only.
// Use y-sad already computed in find_predictors: take the sad with motion
// vector closest to 0; the uv-sad computed below in set_color_sensitivity
// is for zeromv.
- int y_sad = x->pred_mv0_sad[LAST_FRAME];
- if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
- (abs(frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
- abs(frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
- (abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
- abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
- y_sad = x->pred_mv1_sad[LAST_FRAME];
- set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
- yv12_mb[LAST_FRAME]);
+ // For screen: first check if golden reference is being used, if so,
+ // force color_sensitivity on if the color sensitivity for sb_g is on.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
+ if (x->color_sensitivity_sb_g[0] == 1) x->color_sensitivity[0] = 1;
+ if (x->color_sensitivity_sb_g[1] == 1) x->color_sensitivity[1] = 1;
+ } else {
+ int y_sad = x->pred_mv0_sad[LAST_FRAME];
+ if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+ (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+ (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+ y_sad = x->pred_mv1_sad[LAST_FRAME];
+ set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
+ search_state.yv12_mb[LAST_FRAME]);
+ }
}
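A sketch of the y_sad choice above: of the two candidates (NEARESTMV via pred_mv0_sad, NEARMV via pred_mv1_sad), the SAD belonging to whichever motion vector is closer to zero feeds set_color_sensitivity(), since the chroma SAD computed there is for zero motion. Illustrative names, not the encoder's signature.

    #include <limits.h>
    #include <stdlib.h>

    static int pick_y_sad_for_color_check(int sad_nearest, int sad_near,
                                          int nearest_row, int nearest_col,
                                          int near_row, int near_col) {
      if (sad_near != INT_MAX &&
          abs(near_row) + abs(near_col) < abs(nearest_row) + abs(nearest_col))
        return sad_near;  // NEARMV is the closer-to-zero motion vector
      return sad_nearest;
    }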
mi->motion_mode = SIMPLE_TRANSLATION;
#if !CONFIG_REALTIME_ONLY
@@ -3141,16 +3822,37 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
calc_num_proj_ref(cpi, x, mi);
}
#endif
-
- if (enable_filter_search && !force_mv_inter_layer && !comp_pred &&
- ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) &&
- (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) {
+ // set variance threshold for compound mode pruning
+ unsigned int var_threshold = UINT_MAX;
+ if (cpi->sf.rt_sf.prune_compoundmode_with_singlecompound_var && comp_pred &&
+ use_model_yrd_large) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state.vars[INTER_OFFSET(single_mode0)][ref_frame]);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state.vars[INTER_OFFSET(single_mode1)][ref_frame2]);
+ }
+ // decide interpolation filter, build prediction signal, get sse
+ const bool is_mv_subpel =
+ (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07);
+ const bool enable_filt_search_this_mode =
+ (filter_search_enabled_blk == 2)
+ ? true
+ : (filter_search_enabled_blk && !force_mv_inter_layer &&
+ !comp_pred &&
+ (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search));
+ if (is_mv_subpel && enable_filt_search_this_mode) {
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_start(&ms_stat.timer2);
#endif
- search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
- reuse_inter_pred, &this_mode_pred, &this_early_term,
- use_model_yrd_large, best_pickmode.best_sse);
+ search_filter_ref(cpi, x, &search_state.this_rdc, &inter_pred_params_sr,
+ mi_row, mi_col, tmp_buffer, bsize, reuse_inter_pred,
+ &this_mode_pred, &this_early_term, &var,
+ use_model_yrd_large,
+ search_state.best_pickmode.best_sse, comp_pred);
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_mark(&ms_stat.timer2);
ms_stat.ifs_time[bsize][this_mode] +=
@@ -3159,11 +3861,11 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
#if !CONFIG_REALTIME_ONLY
} else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion &&
this_mode == NEWMV) {
- search_motion_mode(cpi, x, &this_rdc, mi_row, mi_col, bsize,
+ search_motion_mode(cpi, x, &search_state.this_rdc, mi_row, mi_col, bsize,
&this_early_term, use_model_yrd_large, &rate_mv,
- best_pickmode.best_sse);
+ search_state.best_pickmode.best_sse);
if (this_mode == NEWMV) {
- frame_mv[this_mode][ref_frame] = mi->mv[0];
+ search_state.frame_mv[this_mode][ref_frame] = mi->mv[0];
}
#endif
} else {
@@ -3174,50 +3876,35 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
if (force_mv_inter_layer)
mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
- // If it is sub-pel motion and best filter was not selected in
- // search_filter_ref() for all blocks, then check top and left values and
- // force smooth if both were selected to be smooth.
- if (cpi->sf.interp_sf.cb_pred_filter_search &&
- (mi->mv[0].as_mv.row & 0x07 || mi->mv[0].as_mv.col & 0x07)) {
- if (xd->left_mbmi && xd->above_mbmi) {
- if ((xd->left_mbmi->interp_filters.as_filters.x_filter ==
- EIGHTTAP_SMOOTH &&
- xd->above_mbmi->interp_filters.as_filters.x_filter ==
- EIGHTTAP_SMOOTH))
- mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_SMOOTH);
- }
- }
+ // If it is sub-pel motion and cb_pred_filter_search is enabled, select
+ // the pre-decided filter
+ if (is_mv_subpel && cb_pred_filter_search)
+ mi->interp_filters = av1_broadcast_interp_filter(filt_select);
+
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_start(&ms_stat.timer2);
#endif
- if (!comp_pred)
- av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
- else
+ if (!comp_pred) {
+ SubpelParams subpel_params;
+ // Initialize inter mode level params for single reference mode.
+ init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
+ av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr,
+ &subpel_params);
+ } else {
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
0);
+ }
if (use_model_yrd_large) {
- unsigned int var_threshold = UINT_MAX;
- if (cpi->sf.rt_sf.prune_global_globalmv_with_zeromv &&
- this_mode == GLOBAL_GLOBALMV) {
- var_threshold = AOMMIN(var_threshold, zeromv_var[ref_frame]);
- var_threshold = AOMMIN(var_threshold, zeromv_var[ref_frame2]);
- }
-
- model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &this_rdc,
- &this_early_term, use_modeled_non_rd_cost,
- best_pickmode.best_sse, &var, var_threshold);
- if (!comp_pred && frame_mv[this_mode][ref_frame].as_int == 0) {
- zeromv_var[ref_frame] = var;
- } else if (this_mode == GLOBAL_GLOBALMV) {
- if (var > var_threshold) {
- if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
- continue;
- }
- }
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &search_state.this_rdc, &this_early_term, 0,
+ search_state.best_pickmode.best_sse, &var,
+ var_threshold);
} else {
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc,
- use_modeled_non_rd_cost);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &search_state.this_rdc, &var, 0,
+ &this_early_term);
}
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_mark(&ms_stat.timer2);
@@ -3225,57 +3912,68 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
aom_usec_timer_elapsed(&ms_stat.timer2);
#endif
}
+ // update variance for single mode
+ if (!comp_pred) {
+ search_state.vars[INTER_OFFSET(this_mode)][ref_frame] = var;
+ if (search_state.frame_mv[this_mode][ref_frame].as_int == 0) {
+ search_state.vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+ }
+ }
+ // prune compound mode based on single mode var threshold
+ if (comp_pred && var > var_threshold) {
+ if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
+ continue;
+ }
- if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) {
- sse_zeromv_norm =
- (unsigned int)(this_rdc.sse >> (b_width_log2_lookup[bsize] +
- b_height_log2_lookup[bsize]));
+ if (ref_frame == LAST_FRAME &&
+ search_state.frame_mv[this_mode][ref_frame].as_int == 0) {
+ sse_zeromv_norm = (unsigned int)(search_state.this_rdc.sse >>
+ (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]));
}
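The normalization above appears to divide the luma SSE of the zero-MV LAST prediction by the number of 4x4 units in the block (assuming b_width_log2_lookup/b_height_log2_lookup give the dimensions in log2 of 4-sample units), yielding a size-independent value for the later bsize/reference-frame skip checks. A minimal sketch under that assumption:

    #include <stdint.h>

    static unsigned int normalized_sse(int64_t sse, int width_log2_4x4,
                                       int height_log2_4x4) {
      // SSE per 4x4 sub-block, comparable across block sizes.
      return (unsigned int)(sse >> (width_log2_4x4 + height_log2_4x4));
    }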
if (cpi->sf.rt_sf.sse_early_term_inter_search &&
early_term_inter_search_with_sse(
- cpi->sf.rt_sf.sse_early_term_inter_search, bsize, this_rdc.sse,
- best_pickmode.best_sse, this_mode)) {
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize,
+ search_state.this_rdc.sse, search_state.best_pickmode.best_sse,
+ this_mode)) {
if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
continue;
}
+#if COLLECT_PICK_MODE_STAT
+ ms_stat.num_nonskipped_searches[bsize][this_mode]++;
+#endif
+
const int skip_ctx = av1_get_skip_txfm_context(xd);
const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
- const int64_t sse_y = this_rdc.sse;
+ const int64_t sse_y = search_state.this_rdc.sse;
if (this_early_term) {
- this_rdc.skip_txfm = 1;
- this_rdc.rate = skip_txfm_cost;
- this_rdc.dist = this_rdc.sse << 4;
+ search_state.this_rdc.skip_txfm = 1;
+ search_state.this_rdc.rate = skip_txfm_cost;
+ search_state.this_rdc.dist = search_state.this_rdc.sse << 4;
} else {
- if (use_modeled_non_rd_cost) {
- if (this_rdc.skip_txfm) {
- this_rdc.rate = skip_txfm_cost;
- } else {
- this_rdc.rate += no_skip_txfm_cost;
- }
- } else {
#if COLLECT_PICK_MODE_STAT
- aom_usec_timer_start(&ms_stat.timer2);
+ aom_usec_timer_start(&ms_stat.timer2);
#endif
- av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, bsize,
- mi->tx_size, DCT_DCT, 1);
- if (this_rdc.skip_txfm ||
- RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >=
- RDCOST(x->rdmult, 0, this_rdc.sse)) {
- if (!this_rdc.skip_txfm) {
- // Need to store "real" rdc for possible furure use if UV rdc
- // disallows tx skip
- nonskip_rdc = this_rdc;
- nonskip_rdc.rate += no_skip_txfm_cost;
- }
- this_rdc.rate = skip_txfm_cost;
- this_rdc.skip_txfm = 1;
- this_rdc.dist = this_rdc.sse;
- } else {
- this_rdc.rate += no_skip_txfm_cost;
+ block_yrd(x, &search_state.this_rdc, &is_skippable, bsize, mi->tx_size,
+ 1);
+ if (search_state.this_rdc.skip_txfm ||
+ RDCOST(x->rdmult, search_state.this_rdc.rate,
+ search_state.this_rdc.dist) >=
+ RDCOST(x->rdmult, 0, search_state.this_rdc.sse)) {
+ if (!search_state.this_rdc.skip_txfm) {
+ // Need to store "real" rdc for possible future use if UV rdc
+ // disallows tx skip
+ nonskip_rdc = search_state.this_rdc;
+ nonskip_rdc.rate += no_skip_txfm_cost;
}
+ search_state.this_rdc.rate = skip_txfm_cost;
+ search_state.this_rdc.skip_txfm = 1;
+ search_state.this_rdc.dist = search_state.this_rdc.sse;
+ } else {
+ search_state.this_rdc.rate += no_skip_txfm_cost;
}
if ((x->color_sensitivity[0] || x->color_sensitivity[1])) {
RD_STATS rdc_uv;
@@ -3289,14 +3987,21 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
AOM_PLANE_V, AOM_PLANE_V);
}
- model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &this_rdc.sse, 1, 2);
+ const int64_t sse_uv =
+ model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, 1, 2);
+ search_state.this_rdc.sse += sse_uv;
// Restore Y rdc if UV rdc disallows txfm skip
- if (this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
+ if (search_state.this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
nonskip_rdc.rate != INT_MAX)
- this_rdc = nonskip_rdc;
- this_rdc.rate += rdc_uv.rate;
- this_rdc.dist += rdc_uv.dist;
- this_rdc.skip_txfm = this_rdc.skip_txfm && rdc_uv.skip_txfm;
+ search_state.this_rdc = nonskip_rdc;
+ if (!comp_pred) {
+ search_state.uv_dist[INTER_OFFSET(this_mode)][ref_frame] =
+ rdc_uv.dist;
+ }
+ search_state.this_rdc.rate += rdc_uv.rate;
+ search_state.this_rdc.dist += rdc_uv.dist;
+ search_state.this_rdc.skip_txfm =
+ search_state.this_rdc.skip_txfm && rdc_uv.skip_txfm;
}
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_mark(&ms_stat.timer2);
@@ -3307,117 +4012,140 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
PREDICTION_MODE this_best_mode = this_mode;
// TODO(kyslov) account for UV prediction cost
- this_rdc.rate += rate_mv;
- if (comp_pred || ref_mode_set != ref_mode_set_reduced) {
+ search_state.this_rdc.rate += rate_mv;
+ if (comp_pred) {
const int16_t mode_ctx =
av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
- this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ search_state.this_rdc.rate +=
+ cost_mv_ref(mode_costs, this_mode, mode_ctx);
} else {
// If the current mode has zeromv but is not GLOBALMV, compare the rate
// cost. If GLOBALMV is cheaper, use GLOBALMV instead.
- if (this_mode != GLOBALMV && frame_mv[this_mode][ref_frame].as_int ==
- frame_mv[GLOBALMV][ref_frame].as_int) {
+ if (this_mode != GLOBALMV &&
+ search_state.frame_mv[this_mode][ref_frame].as_int ==
+ search_state.frame_mv[GLOBALMV][ref_frame].as_int) {
if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs,
- single_inter_mode_costs, mbmi_ext)) {
+ search_state.single_inter_mode_costs,
+ mbmi_ext)) {
this_best_mode = GLOBALMV;
}
}
- this_rdc.rate +=
- single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame];
+ search_state.this_rdc.rate +=
+ search_state
+ .single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame];
}
- if (!comp_pred && frame_mv[this_mode][ref_frame].as_int == 0 &&
+ if (!comp_pred && search_state.frame_mv[this_mode][ref_frame].as_int == 0 &&
var < UINT_MAX) {
- zeromv_var[ref_frame] = var;
+ search_state.vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
}
- this_rdc.rate += ref_costs_single[ref_frame];
+ search_state.this_rdc.rate += search_state.ref_costs_single[ref_frame];
- this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ search_state.this_rdc.rdcost = RDCOST(x->rdmult, search_state.this_rdc.rate,
+ search_state.this_rdc.dist);
if (cpi->oxcf.rc_cfg.mode == AOM_CBR && !comp_pred) {
- newmv_diff_bias(xd, this_best_mode, &this_rdc, bsize,
- frame_mv[this_best_mode][ref_frame].as_mv.row,
- frame_mv[this_best_mode][ref_frame].as_mv.col, cpi->speed,
- x->source_variance, x->content_state_sb);
+ newmv_diff_bias(
+ xd, this_best_mode, &search_state.this_rdc, bsize,
+ search_state.frame_mv[this_best_mode][ref_frame].as_mv.row,
+ search_state.frame_mv[this_best_mode][ref_frame].as_mv.col,
+ cpi->speed, x->source_variance, x->content_state_sb);
}
#if CONFIG_AV1_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
cpi->denoiser.denoising_level > kDenLowLow) {
av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
// Keep track of zero_last cost.
- if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0)
- zero_last_cost_orig = this_rdc.rdcost;
+ if (ref_frame == LAST_FRAME &&
+ search_state.frame_mv[this_mode][ref_frame].as_int == 0)
+ zero_last_cost_orig = search_state.this_rdc.rdcost;
}
#else
(void)sse_y;
#endif
- mode_checked[this_mode][ref_frame] = 1;
- mode_checked[this_best_mode][ref_frame] = 1;
+ search_state.mode_checked[this_mode][ref_frame] = 1;
+ search_state.mode_checked[this_best_mode][ref_frame] = 1;
+
+ if (check_globalmv) {
+ int32_t abs_mv =
+ abs(search_state.frame_mv[this_best_mode][ref_frame].as_mv.row) +
+ abs(search_state.frame_mv[this_best_mode][ref_frame].as_mv.col);
+ // Early exit check: if the magnitude of this_best_mode's mv is small
+ // enough, skip the GLOBALMV check in subsequent loop iterations.
+ if (abs_mv < 2) {
+ check_globalmv = false;
+ }
+ }
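A sketch of the check_globalmv update above: once some tested mode's best motion vector is essentially zero (|row| + |col| < 2 in 1/8-pel units), evaluating GLOBALMV separately is unlikely to win, so the flag is cleared for the remaining iterations. Illustrative helper:

    #include <stdlib.h>

    static int keep_checking_globalmv(int best_mv_row, int best_mv_col) {
      return (abs(best_mv_row) + abs(best_mv_col)) >= 2;
    }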
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_mark(&ms_stat.timer1);
ms_stat.nonskipped_search_times[bsize][this_mode] +=
aom_usec_timer_elapsed(&ms_stat.timer1);
#endif
- if (this_rdc.rdcost < best_rdc.rdcost) {
- best_rdc = this_rdc;
+ if (search_state.this_rdc.rdcost < search_state.best_rdc.rdcost) {
+ search_state.best_rdc = search_state.this_rdc;
best_early_term = this_early_term;
- best_pickmode.best_sse = sse_y;
- best_pickmode.best_mode = this_best_mode;
- best_pickmode.best_motion_mode = mi->motion_mode;
- best_pickmode.wm_params = mi->wm_params;
- best_pickmode.num_proj_ref = mi->num_proj_ref;
- best_pickmode.best_pred_filter = mi->interp_filters;
- best_pickmode.best_tx_size = mi->tx_size;
- best_pickmode.best_ref_frame = ref_frame;
- best_pickmode.best_second_ref_frame = ref_frame2;
- best_pickmode.best_mode_skip_txfm = this_rdc.skip_txfm;
- best_pickmode.best_mode_initial_skip_flag =
- (nonskip_rdc.rate == INT_MAX && this_rdc.skip_txfm);
- if (!best_pickmode.best_mode_skip_txfm && !use_modeled_non_rd_cost) {
- memcpy(best_pickmode.blk_skip, txfm_info->blk_skip,
+ search_state.best_pickmode.best_sse = sse_y;
+ search_state.best_pickmode.best_mode = this_best_mode;
+ search_state.best_pickmode.best_motion_mode = mi->motion_mode;
+ search_state.best_pickmode.wm_params = mi->wm_params;
+ search_state.best_pickmode.num_proj_ref = mi->num_proj_ref;
+ search_state.best_pickmode.best_pred_filter = mi->interp_filters;
+ search_state.best_pickmode.best_tx_size = mi->tx_size;
+ search_state.best_pickmode.best_ref_frame = ref_frame;
+ search_state.best_pickmode.best_second_ref_frame = ref_frame2;
+ search_state.best_pickmode.best_mode_skip_txfm =
+ search_state.this_rdc.skip_txfm;
+ search_state.best_pickmode.best_mode_initial_skip_flag =
+ (nonskip_rdc.rate == INT_MAX && search_state.this_rdc.skip_txfm);
+ if (!search_state.best_pickmode.best_mode_skip_txfm) {
+ memcpy(search_state.best_pickmode.blk_skip, txfm_info->blk_skip,
sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
}
// This is needed for the compound modes.
- frame_mv_best[this_best_mode][ref_frame].as_int =
- frame_mv[this_best_mode][ref_frame].as_int;
+ search_state.frame_mv_best[this_best_mode][ref_frame].as_int =
+ search_state.frame_mv[this_best_mode][ref_frame].as_int;
if (ref_frame2 > NONE_FRAME) {
- frame_mv_best[this_best_mode][ref_frame2].as_int =
- frame_mv[this_best_mode][ref_frame2].as_int;
+ search_state.frame_mv_best[this_best_mode][ref_frame2].as_int =
+ search_state.frame_mv[this_best_mode][ref_frame2].as_int;
}
if (reuse_inter_pred) {
- free_pred_buffer(best_pickmode.best_pred);
- best_pickmode.best_pred = this_mode_pred;
+ free_pred_buffer(search_state.best_pickmode.best_pred);
+ search_state.best_pickmode.best_pred = this_mode_pred;
}
} else {
if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
}
- if (best_early_term && (idx > 0 || cpi->sf.rt_sf.nonrd_agressive_skip)) {
+ if (best_early_term && (idx > 0 || cpi->sf.rt_sf.nonrd_aggressive_skip)) {
txfm_info->skip_txfm = 1;
break;
}
}
- mi->mode = best_pickmode.best_mode;
- mi->motion_mode = best_pickmode.best_motion_mode;
- mi->wm_params = best_pickmode.wm_params;
- mi->num_proj_ref = best_pickmode.num_proj_ref;
- mi->interp_filters = best_pickmode.best_pred_filter;
- mi->tx_size = best_pickmode.best_tx_size;
+ mi->mode = search_state.best_pickmode.best_mode;
+ mi->motion_mode = search_state.best_pickmode.best_motion_mode;
+ mi->wm_params = search_state.best_pickmode.wm_params;
+ mi->num_proj_ref = search_state.best_pickmode.num_proj_ref;
+ mi->interp_filters = search_state.best_pickmode.best_pred_filter;
+ mi->tx_size = search_state.best_pickmode.best_tx_size;
memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
- mi->ref_frame[0] = best_pickmode.best_ref_frame;
+ mi->ref_frame[0] = search_state.best_pickmode.best_ref_frame;
mi->mv[0].as_int =
- frame_mv_best[best_pickmode.best_mode][best_pickmode.best_ref_frame]
+ search_state
+ .frame_mv_best[search_state.best_pickmode.best_mode]
+ [search_state.best_pickmode.best_ref_frame]
.as_int;
mi->mv[1].as_int = 0;
- if (best_pickmode.best_second_ref_frame > INTRA_FRAME) {
- mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
- mi->mv[1].as_int = frame_mv_best[best_pickmode.best_mode]
- [best_pickmode.best_second_ref_frame]
- .as_int;
+ if (search_state.best_pickmode.best_second_ref_frame > INTRA_FRAME) {
+ mi->ref_frame[1] = search_state.best_pickmode.best_second_ref_frame;
+ mi->mv[1].as_int =
+ search_state
+ .frame_mv_best[search_state.best_pickmode.best_mode]
+ [search_state.best_pickmode.best_second_ref_frame]
+ .as_int;
}
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
@@ -3428,72 +4156,88 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
#if COLLECT_PICK_MODE_STAT
aom_usec_timer_start(&ms_stat.timer1);
ms_stat.num_searches[bsize][DC_PRED]++;
+ ms_stat.num_nonskipped_searches[bsize][DC_PRED]++;
#endif
- if (!x->force_zeromv_skip)
- estimate_intra_mode(cpi, x, bsize, use_modeled_non_rd_cost, best_early_term,
- ref_costs_single[INTRA_FRAME], reuse_inter_pred,
- &orig_dst, tmp, &this_mode_pred, &best_rdc,
- &best_pickmode, ctx);
-
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip &&
- is_inter_mode(best_pickmode.best_mode) &&
+ if (!x->force_zeromv_skip_for_blk)
+ estimate_intra_mode(cpi, x, bsize, best_early_term,
+ search_state.ref_costs_single[INTRA_FRAME],
+ reuse_inter_pred, &orig_dst, tmp_buffer,
+ &this_mode_pred, &search_state.best_rdc,
+ &search_state.best_pickmode, ctx);
+
+ int skip_idtx_palette =
+ (x->color_sensitivity[0] || x->color_sensitivity[1]) &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ !cpi->rc.high_source_sad;
+
+ // Check for IDTX: based only on Y channel, so avoid when color_sensitivity
+ // is set.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette &&
+ !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk &&
+ is_inter_mode(search_state.best_pickmode.best_mode) &&
(!cpi->sf.rt_sf.prune_idtx_nonrd ||
(cpi->sf.rt_sf.prune_idtx_nonrd && bsize <= BLOCK_32X32 &&
- best_pickmode.best_mode_skip_txfm != 1 && x->source_variance > 200))) {
+ search_state.best_pickmode.best_mode_skip_txfm != 1 &&
+ x->source_variance > 200))) {
RD_STATS idtx_rdc;
av1_init_rd_stats(&idtx_rdc);
int is_skippable;
- this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
pd->dst.buf = this_mode_pred->data;
pd->dst.stride = bw;
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
- av1_block_yrd(cpi, x, mi_row, mi_col, &idtx_rdc, &is_skippable, bsize,
- mi->tx_size, IDTX, 1);
+ block_yrd_idtx(x, &idtx_rdc, &is_skippable, bsize, mi->tx_size);
int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
- if (idx_rdcost < best_rdc.rdcost) {
- // Keep the skip_txfm off if the color_sensitivity is set,
- // for scene/slide change.
- if (cpi->rc.high_source_sad &&
- (x->color_sensitivity[0] || x->color_sensitivity[1]))
+ if (idx_rdcost < search_state.best_rdc.rdcost) {
+ // Keep the skip_txfm off if the color_sensitivity is set.
+ if (x->color_sensitivity[0] || x->color_sensitivity[1])
idtx_rdc.skip_txfm = 0;
- best_pickmode.tx_type = IDTX;
- best_rdc.rdcost = idx_rdcost;
- best_pickmode.best_mode_skip_txfm = idtx_rdc.skip_txfm;
+ search_state.best_pickmode.tx_type = IDTX;
+ search_state.best_rdc.rdcost = idx_rdcost;
+ search_state.best_pickmode.best_mode_skip_txfm = idtx_rdc.skip_txfm;
if (!idtx_rdc.skip_txfm) {
- memcpy(best_pickmode.blk_skip, txfm_info->blk_skip,
+ memcpy(search_state.best_pickmode.blk_skip, txfm_info->blk_skip,
sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
}
- xd->tx_type_map[0] = best_pickmode.tx_type;
- memset(ctx->tx_type_map, best_pickmode.tx_type, ctx->num_4x4_blk);
- memset(xd->tx_type_map, best_pickmode.tx_type, ctx->num_4x4_blk);
+ xd->tx_type_map[0] = search_state.best_pickmode.tx_type;
+ memset(ctx->tx_type_map, search_state.best_pickmode.tx_type,
+ ctx->num_4x4_blk);
+ memset(xd->tx_type_map, search_state.best_pickmode.tx_type,
+ ctx->num_4x4_blk);
}
pd->dst = orig_dst;
}
int try_palette =
- cpi->oxcf.tool_cfg.enable_palette &&
+ !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette &&
av1_allow_palette(cpi->common.features.allow_screen_content_tools,
mi->bsize);
- try_palette = try_palette && is_mode_intra(best_pickmode.best_mode) &&
- x->source_variance > 0 && !x->force_zeromv_skip &&
+ try_palette = try_palette &&
+ is_mode_intra(search_state.best_pickmode.best_mode) &&
+ x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
(cpi->rc.high_source_sad || x->source_variance > 500);
if (try_palette) {
- const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+ const unsigned int intra_ref_frame_cost =
+ search_state.ref_costs_single[INTRA_FRAME];
av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx,
- &this_rdc, best_rdc.rdcost);
- if (this_rdc.rdcost < best_rdc.rdcost) {
- best_pickmode.pmi = mi->palette_mode_info;
- best_pickmode.best_mode = DC_PRED;
+ &search_state.this_rdc,
+ search_state.best_rdc.rdcost);
+ if (search_state.this_rdc.rdcost < search_state.best_rdc.rdcost) {
+ search_state.best_pickmode.pmi = mi->palette_mode_info;
+ search_state.best_pickmode.best_mode = DC_PRED;
mi->mv[0].as_int = 0;
- best_rdc.rate = this_rdc.rate;
- best_rdc.dist = this_rdc.dist;
- best_rdc.rdcost = this_rdc.rdcost;
- best_pickmode.best_mode_skip_txfm = this_rdc.skip_txfm;
- if (!this_rdc.skip_txfm) {
+ search_state.best_rdc.rate = search_state.this_rdc.rate;
+ search_state.best_rdc.dist = search_state.this_rdc.dist;
+ search_state.best_rdc.rdcost = search_state.this_rdc.rdcost;
+ search_state.best_pickmode.best_mode_skip_txfm =
+ search_state.this_rdc.skip_txfm;
+ // Keep the skip_txfm off if the color_sensitivity is set.
+ if (x->color_sensitivity[0] || x->color_sensitivity[1])
+ search_state.this_rdc.skip_txfm = 0;
+ if (!search_state.this_rdc.skip_txfm) {
memcpy(ctx->blk_skip, txfm_info->blk_skip,
sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
}
@@ -3509,18 +4253,18 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
#endif
pd->dst = orig_dst;
- if (try_palette) mi->palette_mode_info = best_pickmode.pmi;
- mi->mode = best_pickmode.best_mode;
- mi->ref_frame[0] = best_pickmode.best_ref_frame;
- mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
- txfm_info->skip_txfm = best_pickmode.best_mode_skip_txfm;
+ if (try_palette) mi->palette_mode_info = search_state.best_pickmode.pmi;
+ mi->mode = search_state.best_pickmode.best_mode;
+ mi->ref_frame[0] = search_state.best_pickmode.best_ref_frame;
+ mi->ref_frame[1] = search_state.best_pickmode.best_second_ref_frame;
+ txfm_info->skip_txfm = search_state.best_pickmode.best_mode_skip_txfm;
if (!txfm_info->skip_txfm) {
// For inter modes: copy blk_skip from best_pickmode, which is
// defined for 8x8 blocks. If palette or intra mode was selected
// as best then blk_skip is already copied into the ctx.
- if (best_pickmode.best_mode >= INTRA_MODE_END)
- memcpy(ctx->blk_skip, best_pickmode.blk_skip,
- sizeof(best_pickmode.blk_skip[0]) * num_8x8_blocks);
+ if (search_state.best_pickmode.best_mode >= INTRA_MODE_END)
+ memcpy(ctx->blk_skip, search_state.best_pickmode.blk_skip,
+ sizeof(search_state.best_pickmode.blk_skip[0]) * num_8x8_blocks);
}
if (has_second_ref(mi)) {
mi->comp_group_idx = 0;
@@ -3532,8 +4276,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
}
- if (reuse_inter_pred && best_pickmode.best_pred != NULL) {
- PRED_BUFFER *const best_pred = best_pickmode.best_pred;
+ if (reuse_inter_pred && search_state.best_pickmode.best_pred != NULL) {
+ PRED_BUFFER *const best_pred = search_state.best_pickmode.best_pred;
if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
pd->dst.stride, bw, bh);
@@ -3546,22 +4290,25 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
cpi->denoiser.reset == 0) {
AV1_DENOISER_DECISION decision = COPY_BLOCK;
ctx->sb_skip_denoising = 0;
- av1_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_costs_single,
- frame_mv, reuse_inter_pred, &best_pickmode);
+ av1_pickmode_ctx_den_update(
+ &ctx_den, zero_last_cost_orig, search_state.ref_costs_single,
+ search_state.frame_mv, reuse_inter_pred, &search_state.best_pickmode);
av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
gf_temporal_ref);
if (denoise_recheck_zeromv)
- recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den,
- yv12_mb, &best_rdc, &best_pickmode, bsize,
- mi_row, mi_col);
- best_pickmode.best_ref_frame = ctx_den.best_ref_frame;
+ recheck_zeromv_after_denoising(
+ cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb,
+ &search_state.best_rdc, &search_state.best_pickmode, bsize, mi_row,
+ mi_col);
+ search_state.best_pickmode.best_ref_frame = ctx_den.best_ref_frame;
}
#endif
if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
THR_MODES best_mode_idx =
- mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)];
- if (best_pickmode.best_ref_frame == INTRA_FRAME) {
+ mode_idx[search_state.best_pickmode.best_ref_frame]
+ [mode_offset(mi->mode)];
+ if (search_state.best_pickmode.best_ref_frame == INTRA_FRAME) {
// Only consider the modes that are included in the intra_mode_list.
int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
for (int i = 0; i < intra_modes; i++) {
@@ -3571,7 +4318,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
} else {
PREDICTION_MODE this_mode;
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
- update_thresh_freq_fact(cpi, x, bsize, best_pickmode.best_ref_frame,
+ update_thresh_freq_fact(cpi, x, bsize,
+ search_state.best_pickmode.best_ref_frame,
best_mode_idx, this_mode);
}
}
@@ -3582,53 +4330,14 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
#else
store_coding_context(x, ctx);
#endif // CONFIG_INTERNAL_STATS
-#if COLLECT_PICK_MODE_STAT
- aom_usec_timer_mark(&ms_stat.timer2);
- ms_stat.avg_block_times[bsize] += aom_usec_timer_elapsed(&ms_stat.timer2);
- //
- if ((mi_row + mi_size_high[bsize] >= (cpi->common.mi_params.mi_rows)) &&
- (mi_col + mi_size_wide[bsize] >= (cpi->common.mi_params.mi_cols))) {
- int i, j;
- BLOCK_SIZE bss[5] = { BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
- BLOCK_128X128 };
- int64_t total_time = 0l;
- int32_t total_blocks = 0;
-
- printf("\n");
- for (i = 0; i < 5; i++) {
- printf("BS(%d) Num %d, Avg_time %f:\n", bss[i],
- ms_stat.num_blocks[bss[i]],
- ms_stat.num_blocks[bss[i]] > 0
- ? (float)ms_stat.avg_block_times[bss[i]] /
- ms_stat.num_blocks[bss[i]]
- : 0);
- total_time += ms_stat.avg_block_times[bss[i]];
- total_blocks += ms_stat.num_blocks[bss[i]];
- for (j = 0; j < MB_MODE_COUNT; j++) {
- if (ms_stat.nonskipped_search_times[bss[i]][j] == 0) {
- continue;
- }
- printf(" Mode %d, %d/%d tps %f\n", j,
- ms_stat.num_nonskipped_searches[bss[i]][j],
- ms_stat.num_searches[bss[i]][j],
- ms_stat.num_nonskipped_searches[bss[i]][j] > 0
- ? (float)ms_stat.nonskipped_search_times[bss[i]][j] /
- ms_stat.num_nonskipped_searches[bss[i]][j]
- : 0l);
- if (j >= INTER_MODE_START) {
- printf(" Motion Search Time: %ld\n", ms_stat.ms_time[bss[i]][j]);
- printf(" Filter Search Time: %ld\n", ms_stat.ifs_time[bss[i]][j]);
- printf(" Model RD Time: %ld\n",
- ms_stat.model_rd_time[bss[i]][j]);
- printf(" Tranfm Search Time: %ld\n", ms_stat.txfm_time[bss[i]][j]);
- }
- }
- printf("\n");
- }
- printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
- }
- //
+#if COLLECT_PICK_MODE_STAT
+ aom_usec_timer_mark(&ms_stat.bsize_timer);
+ ms_stat.total_block_times[bsize] +=
+ aom_usec_timer_elapsed(&ms_stat.bsize_timer);
+ print_time(&ms_stat, bsize, cm->mi_params.mi_rows, cm->mi_params.mi_cols,
+ mi_row, mi_col);
#endif // COLLECT_PICK_MODE_STAT
- *rd_cost = best_rdc;
+
+ *rd_cost = search_state.best_rdc;
}
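
For reference, the IDTX and palette checks above follow one pattern: compute an RD cost from a rate and a distortion estimate, then replace the running best (search_state.best_rdc / best_pickmode) only when the candidate cost is lower. Below is a minimal sketch of that pattern; rd_cost_sketch is a hypothetical simplification of libaom's RDCOST macro, which uses fixed-point shifts rather than a plain multiply, and the struct and function names are illustrative only.

#include <stdint.h>

typedef struct {
  int rate;        // estimated bits
  int64_t dist;    // estimated distortion
  int64_t rdcost;  // combined cost
} rd_stats_sketch;

// Hypothetical stand-in for RDCOST(): cost = lambda * rate + dist.
static int64_t rd_cost_sketch(int rdmult, int rate, int64_t dist) {
  return (int64_t)rdmult * rate + dist;
}

// Keep the candidate only if it beats the current best, as the IDTX and
// palette checks above do against search_state.best_rdc.
static void maybe_update_best(rd_stats_sketch *best, rd_stats_sketch *cand,
                              int rdmult) {
  cand->rdcost = rd_cost_sketch(rdmult, cand->rate, cand->dist);
  if (cand->rdcost < best->rdcost) *best = *cand;
}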
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index 336333648..9c3d407ac 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -31,14 +31,14 @@
#include "av1/encoder/k_means_template.h"
#undef AV1_K_MEANS_DIM
-static int int_comparer(const void *a, const void *b) {
- return (*(int *)a - *(int *)b);
+static int int16_comparer(const void *a, const void *b) {
+ return (*(int16_t *)a - *(int16_t *)b);
}
-int av1_remove_duplicates(int *centroids, int num_centroids) {
+int av1_remove_duplicates(int16_t *centroids, int num_centroids) {
int num_unique; // number of unique centroids
int i;
- qsort(centroids, num_centroids, sizeof(*centroids), int_comparer);
+ qsort(centroids, num_centroids, sizeof(*centroids), int16_comparer);
// Remove duplicates.
num_unique = 1;
for (i = 1; i < num_centroids; ++i) {
@@ -189,14 +189,14 @@ static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
// TODO(huisu): Try other schemes to improve compression.
static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
int n_cache, int n_colors,
- int stride, int *centroids,
+ int stride, int16_t *centroids,
int bit_depth) {
if (n_cache <= 0) return;
for (int i = 0; i < n_colors * stride; i += stride) {
- int min_diff = abs(centroids[i] - (int)color_cache[0]);
+ int min_diff = abs((int)centroids[i] - (int)color_cache[0]);
int idx = 0;
for (int j = 1; j < n_cache; ++j) {
- const int this_diff = abs(centroids[i] - color_cache[j]);
+ const int this_diff = abs((int)centroids[i] - (int)color_cache[j]);
if (this_diff < min_diff) {
min_diff = this_diff;
idx = j;
@@ -216,10 +216,10 @@ static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
*/
static AOM_INLINE void palette_rd_y(
const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
- uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids,
+ int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating,
MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
- int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
uint8_t *tx_type_map, int *beat_best_palette_rd,
bool *do_header_rd_based_breakout, int discount_color_cost) {
@@ -324,14 +324,14 @@ static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) {
// returns the best number of colors found.
static AOM_INLINE int perform_top_color_palette_search(
const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
- int start_n, int end_n, int step_size, bool do_header_rd_based_gating,
- int *last_n_searched, uint16_t *color_cache, int n_cache,
- MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
- int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
- int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
- uint8_t *tx_type_map, int discount_color_cost) {
- int centroids[PALETTE_MAX_SIZE];
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data,
+ int16_t *top_colors, int start_n, int end_n, int step_size,
+ bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map, int discount_color_cost) {
+ int16_t centroids[PALETTE_MAX_SIZE];
int n = start_n;
int top_color_winner = end_n;
/* clang-format off */
@@ -371,15 +371,15 @@ static AOM_INLINE int perform_top_color_palette_search(
// returns the best number of colors found.
static AOM_INLINE int perform_k_means_palette_search(
const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lower_bound,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound,
int upper_bound, int start_n, int end_n, int step_size,
bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
- int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
int data_points, int discount_color_cost) {
- int centroids[PALETTE_MAX_SIZE];
+ int16_t centroids[PALETTE_MAX_SIZE];
const int max_itr = 50;
int n = start_n;
int top_color_winner = end_n;
@@ -435,16 +435,19 @@ static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size,
*step_size = AOMMAX(1, *max_n - *min_n);
}
-static AOM_INLINE void fill_data_and_get_bounds(
- const uint8_t *src, const int src_stride, const int rows, const int cols,
- const int is_high_bitdepth, int *data, int *lower_bound, int *upper_bound) {
+static AOM_INLINE void fill_data_and_get_bounds(const uint8_t *src,
+ const int src_stride,
+ const int rows, const int cols,
+ const int is_high_bitdepth,
+ int16_t *data, int *lower_bound,
+ int *upper_bound) {
if (is_high_bitdepth) {
const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
*lower_bound = *upper_bound = src_ptr[0];
for (int r = 0; r < rows; ++r) {
for (int c = 0; c < cols; ++c) {
const int val = src_ptr[c];
- data[c] = val;
+ data[c] = (int16_t)val;
*lower_bound = AOMMIN(*lower_bound, val);
*upper_bound = AOMMAX(*upper_bound, val);
}
@@ -459,7 +462,7 @@ static AOM_INLINE void fill_data_and_get_bounds(
for (int r = 0; r < rows; ++r) {
for (int c = 0; c < cols; ++c) {
const int val = src[c];
- data[c] = val;
+ data[c] = (int16_t)val;
*lower_bound = AOMMIN(*lower_bound, val);
*upper_bound = AOMMAX(*upper_bound, val);
}
@@ -487,7 +490,7 @@ int color_count_comp(const void *c1, const void *c2) {
}
static void find_top_colors(const int *const count_buf, int bit_depth,
- int n_colors, int *top_colors) {
+ int n_colors, int16_t *top_colors) {
// Top color array, serving as a priority queue if more than n_colors are
// found.
struct ColorCount top_color_counts[PALETTE_MAX_SIZE] = { { 0 } };
@@ -527,7 +530,7 @@ static void find_top_colors(const int *const count_buf, int bit_depth,
void av1_rd_pick_palette_intra_sby(
const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
- int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
uint8_t *tx_type_map) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -562,8 +565,8 @@ void av1_rd_pick_palette_intra_sby(
uint8_t *const color_map = xd->plane[0].color_index_map;
if (colors_threshold > 1 && colors_threshold <= 64) {
- int *const data = x->palette_buffer->kmeans_data_buf;
- int centroids[PALETTE_MAX_SIZE];
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[PALETTE_MAX_SIZE];
int lower_bound, upper_bound;
fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data,
&lower_bound, &upper_bound);
@@ -575,7 +578,7 @@ void av1_rd_pick_palette_intra_sby(
const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
// Find the dominant colors, stored in top_colors[].
- int top_colors[PALETTE_MAX_SIZE] = { 0 };
+ int16_t top_colors[PALETTE_MAX_SIZE] = { 0 };
find_top_colors(count_buf, bit_depth, AOMMIN(colors, PALETTE_MAX_SIZE),
top_colors);
@@ -740,7 +743,7 @@ void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
MB_MODE_INFO *const best_mbmi,
int64_t *best_rd, int *rate,
int *rate_tokenonly, int64_t *distortion,
- int *skippable) {
+ uint8_t *skippable) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
assert(!is_inter_block(mbmi));
@@ -791,8 +794,8 @@ void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
const int max_itr = 50;
int lb_u, ub_u, val_u;
int lb_v, ub_v, val_v;
- int *const data = x->palette_buffer->kmeans_data_buf;
- int centroids[2 * PALETTE_MAX_SIZE];
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[2 * PALETTE_MAX_SIZE];
uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
@@ -917,8 +920,8 @@ void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
int src_stride = x->plane[1].src.stride;
const uint8_t *const src_u = x->plane[1].src.buf;
const uint8_t *const src_v = x->plane[2].src.buf;
- int *const data = x->palette_buffer->kmeans_data_buf;
- int centroids[2 * PALETTE_MAX_SIZE];
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[2 * PALETTE_MAX_SIZE];
uint8_t *const color_map = xd->plane[1].color_index_map;
int r, c;
const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
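
The palette.c changes above narrow the k-means buffers from int to int16_t, since pixel values fit in 16 bits even at high bit depth. Below is a self-contained sketch of the sort-then-deduplicate step that av1_remove_duplicates performs on such a centroid array, assuming (as the header notes) that the centroids have already been rounded to integers; the helper names are illustrative only.

#include <stdint.h>
#include <stdlib.h>

static int cmp_int16(const void *a, const void *b) {
  return (int)*(const int16_t *)a - (int)*(const int16_t *)b;
}

// Sort centroids and drop duplicates, returning the unique count; this
// mirrors the structure of av1_remove_duplicates() after the int16_t change.
static int remove_duplicates_sketch(int16_t *centroids, int num) {
  if (num <= 0) return 0;
  qsort(centroids, num, sizeof(*centroids), cmp_int16);
  int num_unique = 1;
  for (int i = 1; i < num; ++i) {
    if (centroids[i] != centroids[i - 1]) centroids[num_unique++] = centroids[i];
  }
  return num_unique;
}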
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index 7d9a72f61..7da863a0c 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -28,10 +28,10 @@ struct macroblock;
/*!\cond */
#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
-void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids,
uint8_t *indices, int n, int k,
int max_itr);
-void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids,
uint8_t *indices, int n, int k,
int max_itr);
/*!\endcond */
@@ -49,16 +49,17 @@ void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
* \param[in] k Number of clusters.
* \param[in] dim Data dimension.
*
- * \return Returns nothing, but saves each data's cluster index in indices.
+ * \remark Returns nothing, but saves each data's cluster index in \a indices.
*/
-static INLINE void av1_calc_indices(const int *data, const int *centroids,
- uint8_t *indices, int n, int k, int dim) {
+static INLINE void av1_calc_indices(const int16_t *data,
+ const int16_t *centroids, uint8_t *indices,
+ int n, int k, int dim) {
assert(n > 0);
assert(k > 0);
if (dim == 1) {
- av1_calc_indices_dim1(data, centroids, indices, n, k);
+ av1_calc_indices_dim1(data, centroids, indices, /*total_dist=*/NULL, n, k);
} else if (dim == 2) {
- av1_calc_indices_dim2(data, centroids, indices, n, k);
+ av1_calc_indices_dim2(data, centroids, indices, /*total_dist=*/NULL, n, k);
} else {
assert(0 && "Untemplated k means dimension");
}
@@ -79,12 +80,12 @@ static INLINE void av1_calc_indices(const int *data, const int *centroids,
* \param[in] dim Data dimension.
* \param[in] max_itr Maximum number of iterations to run.
*
- * \return Returns nothing, but saves each cluster's centroid in centroids and
- * each data's cluster index in indices.
+ * \remark Returns nothing, but saves each cluster's centroid in centroids and
+ * each data's cluster index in \a indices.
*
* \attention The output centroids are rounded off to nearest integers.
*/
-static INLINE void av1_k_means(const int *data, int *centroids,
+static INLINE void av1_k_means(const int16_t *data, int16_t *centroids,
uint8_t *indices, int n, int k, int dim,
int max_itr) {
assert(n > 0);
@@ -110,7 +111,7 @@ static INLINE void av1_k_means(const int *data, int *centroids,
* \attention The centroids should be rounded to integers before calling this
* method.
*/
-int av1_remove_duplicates(int *centroids, int num_centroids);
+int av1_remove_duplicates(int16_t *centroids, int num_centroids);
/*!\brief Checks what colors are in the color cache.
*
@@ -186,7 +187,7 @@ void av1_rd_pick_palette_intra_sby(
const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
- int *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
+ uint8_t *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
uint8_t *best_blk_skip, uint8_t *tx_type_map);
/*!\brief Search for the best palette in the chroma plane.
@@ -201,7 +202,7 @@ void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
MB_MODE_INFO *const best_mbmi,
int64_t *best_rd, int *rate,
int *rate_tokenonly, int64_t *distortion,
- int *skippable);
+ uint8_t *skippable);
/*!\brief Resets palette color map for chroma channels.
*/
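
With the prototypes shown in the header above, a hypothetical caller would drive the 1-D k-means as sketched below. This assumes av1/encoder/palette.h is included and n_samples > 0; the function name, buffer sizes, and initial centroids here are illustrative only.

#include <stdint.h>

// Cluster 1-D luma samples into at most four palette colors.
void palette_kmeans_example(const int16_t *samples, int n_samples,
                            uint8_t *indices /* n_samples entries */) {
  int16_t centroids[4] = { 0, 64, 128, 192 };  // initial guesses
  const int k = 4, dim = 1, max_itr = 50;
  av1_k_means(samples, centroids, indices, n_samples, k, dim, max_itr);
  // centroids[] now holds the rounded cluster centers and indices[i] the
  // cluster assigned to samples[i]; duplicate centers can be dropped with
  // av1_remove_duplicates(centroids, k).
}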
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 4dd47e0fe..8d06bf51d 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -9,6 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <float.h>
+
#include "aom_dsp/txfm_common.h"
#include "av1/common/av1_common_int.h"
@@ -450,13 +452,10 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
xd->block_ref_scale_factors[ref], num_planes);
}
// Predicted sample of inter mode (for Luma plane) cannot be reused if
- // nonrd_check_partition_merge_mode or nonrd_check_partition_split speed
- // feature is enabled, Since in such cases the buffer may not contain the
- // predicted sample of best mode.
+ // nonrd_check_partition_split speed feature is enabled, since in such cases
+ // the buffer may not contain the predicted sample of the best mode.
const int start_plane =
- (cpi->sf.rt_sf.reuse_inter_pred_nonrd &&
- (!cpi->sf.rt_sf.nonrd_check_partition_merge_mode) &&
- (!cpi->sf.rt_sf.nonrd_check_partition_split) &&
+ (x->reuse_inter_pred && (!cpi->sf.rt_sf.nonrd_check_partition_split) &&
cm->seq_params->bit_depth == AOM_BITS_8)
? 1
: 0;
@@ -593,9 +592,9 @@ static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
: av1_log_block_var(cpi, x, bsize);
mbmi->segment_id = energy;
}
- x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
- x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == CYCLIC_REFRESH_AQ) {
// If segment is boosted, use rdmult for that segment.
if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
@@ -724,7 +723,7 @@ void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
* \param[in] ctx Structure to hold snapshot of coding context
during the mode picking process
*
- * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
* is modified to store information about the best mode computed
* in this function. The rd_cost struct is also updated with the RD stats
* corresponding to the best mode found.
@@ -810,7 +809,7 @@ static AOM_INLINE void wait_for_top_right_sb(
* chosen modes for the current block
* \param[in] best_rd Upper bound of rd cost of a valid partition
*
- * \return Nothing is returned. Instead, the chosen modes and contexts necessary
+ * \remark Nothing is returned. Instead, the chosen modes and contexts necessary
* for reconstruction are stored in ctx, the rate-distortion stats are stored in
* rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be
* signalled by an INT64_MAX rd_cost->rdcost.
@@ -1391,7 +1390,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) {
* chosen modes for the current block
* \param[in] rate Pointer to the total rate for the current block
*
- * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters)
* will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes
* will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0].
*/
@@ -1554,7 +1553,7 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
* partitions and mode info for the current block
* \param[in] rate Pointer to the total rate for the current block
*
- * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters)
* will be updated in the pixel buffers in td->mb.e_mbd.
*/
static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
@@ -1736,7 +1735,7 @@ reference for future sub-partitions
* \param[in] pc_tree Pointer to the PC_TREE node holding the picked
partitions and mode info for the current block
*
-* \return Nothing is returned. The pc_tree struct is modified to store the
+* \remark Nothing is returned. The pc_tree struct is modified to store the
* picked partition and modes. The rate and dist are also updated with those
* corresponding to the best partition found.
*/
@@ -2127,7 +2126,7 @@ static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
}
if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
!cpi->rc.rtc_external_ratectrl && cm->seg.enabled)
- av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize);
+ av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run);
// TODO(Ravi/Remya): Move this copy function to a better logical place
// This function will copy the best mode information from block
// level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
@@ -2141,6 +2140,46 @@ static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
#endif
}
+static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ // Force zero MV skip based on SB level decision
+ if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb;
+
+ // For blocks equal to the superblock size, the decision has already been
+ // made at the superblock level, so the zeromv-skip check is skipped here.
+ const AV1_COMMON *const cm = &cpi->common;
+ if (bsize == cm->seq_params->sb_size) return 0;
+
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const unsigned int thresh_exit_part_y =
+ cpi->zeromv_skip_thresh_exit_part[bsize];
+ const unsigned int thresh_exit_part_uv =
+ CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y);
+ const unsigned int thresh_exit_part[MAX_MB_PLANE] = { thresh_exit_part_y,
+ thresh_exit_part_uv,
+ thresh_exit_part_uv };
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, yv12, sf, sf, num_planes);
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+ assert(plane < MAX_MB_PLANE);
+ if (plane_sad >= thresh_exit_part[plane]) return 0;
+ }
+ return 1;
+}
+
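// A simplified sketch of the per-plane gate implemented above: the block is
// forced to zeromv-skip only when every plane's SAD against the LAST_FRAME
// reference stays below its threshold (the chroma thresholds are derived from
// the luma one). Hypothetical helper, not part of libaom.
static int all_planes_below_thresh(const unsigned int *plane_sad,
                                   const unsigned int *thresh,
                                   int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (plane_sad[plane] >= thresh[plane]) return 0;  // this plane is too busy
  }
  return 1;
}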
/*!\brief Top level function to pick block mode for non-RD optimized case
*
* \ingroup partition_search
@@ -2168,7 +2207,7 @@ static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
* \param[in] ctx Pointer to structure holding coding contexts and
* chosen modes for the current block
*
- * \return Nothing is returned. Instead, the chosen modes and contexts necessary
+ * \remark Nothing is returned. Instead, the chosen modes and contexts necessary
* for reconstruction are stored in ctx, the rate-distortion stats are stored in
* rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be
* signalled by an INT64_MAX rd_cost->rdcost.
@@ -2179,7 +2218,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
PICK_MODE_CONTEXT *ctx) {
// For nonrd mode, av1_set_offsets is already called at the superblock level
// in encode_nonrd_sb when we determine the partitioning.
- if (bsize != cpi->common.seq_params->sb_size) {
+ if (bsize != cpi->common.seq_params->sb_size ||
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
}
assert(x->last_set_offsets_loc.mi_row == mi_row &&
@@ -2215,7 +2255,11 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
}
for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
- if (!x->force_zeromv_skip) {
+
+ x->force_zeromv_skip_for_blk =
+ get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
+
+ if (!x->force_zeromv_skip_for_blk) {
x->source_variance = av1_get_perpixel_variance_facade(
cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
}
@@ -2253,6 +2297,12 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
#endif
}
if (cpi->sf.rt_sf.skip_cdef_sb) {
+ // cdef_strength is initialized to 1 which means skip_cdef, and is updated
+ // here. Check to see if skipping cdef is allowed.
+ const int allow_cdef_skipping =
+ cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
+ !(x->color_sensitivity[0] || x->color_sensitivity[1]);
+
// Find the corresponding 64x64 block. It'll be the 128x128 block if that's
// the block size.
const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
@@ -2262,12 +2312,11 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
// Do not skip if intra or new mv is picked, or color sensitivity is set.
// Never skip on slide/scene change.
- mi_sb[0]->skip_cdef_curr_sb =
- mi_sb[0]->skip_cdef_curr_sb && !cpi->rc.high_source_sad &&
- !(x->color_sensitivity[0] || x->color_sensitivity[1]) &&
+ mi_sb[0]->cdef_strength =
+ mi_sb[0]->cdef_strength && allow_cdef_skipping &&
!(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
// Store in the pickmode context.
- ctx->mic.skip_cdef_curr_sb = mi_sb[0]->skip_cdef_curr_sb;
+ ctx->mic.cdef_strength = mi_sb[0]->cdef_strength;
}
x->rdmult = orig_rdmult;
ctx->rd_stats.rate = rd_cost->rate;
@@ -2278,6 +2327,332 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
#endif
}
+static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
+ TileDataEnc *const tile_data,
+ TileInfo *const tile_info, TokenExtra **tp,
+ MACROBLOCK *const x, MACROBLOCKD *const xd,
+ const CommonModeInfoParams *const mi_params,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize, const int pl,
+ PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int hbs = mi_size_wide[bsize] / 2;
+ if (mi_row + mi_size_high[bsize] >= mi_params->mi_rows ||
+ mi_col + mi_size_wide[bsize] >= mi_params->mi_cols)
+ return 0;
+ if (bsize <= BLOCK_8X8 || frame_is_intra_only(cm)) return 0;
+ if (x->content_state_sb.source_sad_nonrd <= kLowSad) return 0;
+
+ // Do not try split partition when the source sad is small, or
+ // the prediction residual is small.
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+ const int num_planes = av1_num_planes(cm);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf, num_planes);
+ int block_sad = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride);
+ block_sad += plane_sad;
+ }
+ const int blk_pix = block_size_wide[bsize] * block_size_high[bsize];
+ const int block_avg_sad = block_sad / blk_pix;
+ // TODO(chengchen): find a proper threshold. It might change according to
+ // q as well.
+ const int threshold = 25;
+ if (block_avg_sad < threshold) return 0;
+
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS split_rdc, none_rdc;
+ av1_invalid_rd_stats(&split_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Calculate rdcost for none partition
+ pc_tree->partitioning = PARTITION_NONE;
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+ pc_tree->none);
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ // Calculate rdcost for split partition
+ pc_tree->partitioning = PARTITION_SPLIT;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ av1_init_rd_stats(&split_rdc);
+ split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ if (subsize >= BLOCK_8X8) {
+ split_rdc.rate += (mode_costs->partition_cost[pl][PARTITION_NONE] * 4);
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (!pc_tree->split[i]) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ }
+ pc_tree->split[i]->index = i;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ RD_STATS block_rdc;
+ av1_invalid_rd_stats(&block_rdc);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ } else {
+ av1_reset_pmc(pc_tree->split[i]->none);
+ }
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &block_rdc, subsize, pc_tree->split[i]->none);
+ split_rdc.rate += block_rdc.rate;
+ split_rdc.dist += block_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &split_rdc);
+ if (none_rdc.rdcost < split_rdc.rdcost) break;
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+ subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+ }
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+ const int split = split_rdc.rdcost < none_rdc.rdcost;
+
+ return split;
+}
+
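// Sketch of the residual gate at the top of try_split_partition() above: the
// split evaluation is skipped entirely when the mean per-pixel SAD of the
// block (summed over planes) falls below a fixed threshold. The constant 25
// is the same heuristic used above and, per the TODO, may need to track q.
// Hypothetical helper, not part of libaom.
static int worth_trying_split(int64_t block_sad, int block_w, int block_h) {
  const int threshold = 25;
  const int64_t blk_pix = (int64_t)block_w * block_h;
  return block_sad / blk_pix >= threshold;
}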
+// Returns whether SPLIT partitions should be evaluated.
+static bool calc_do_split_flag(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, const RD_STATS *none_rdc,
+ const CommonModeInfoParams *mi_params,
+ int mi_row, int mi_col, int hbs,
+ BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_larger_qindex = cm->quant_params.base_qindex > 100;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ bool do_split =
+ (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3)
+ ? (bsize <= BLOCK_32X32 || (is_larger_qindex && bsize <= BLOCK_64X64))
+ : true;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
+ cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) ||
+ !none_rdc->skip_txfm)
+ return do_split;
+
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+ // When model-based skip is not used (i.e., use_model_yrd_large = 0), skip_txfm
+ // has been populated based on the Hadamard transform, so the skip_txfm flag is
+ // more reliable. Hence SPLIT evaluation is disabled at all quantizers for 8x8
+ // and 16x16 blocks.
+ // When model-based skip is used (i.e., use_model_yrd_large = 1), skip_txfm may
+ // not be reliable. Hence SPLIT evaluation is disabled only at lower
+ // quantizers for blocks >= 32x32.
+ if ((!use_model_yrd_large) || (!is_larger_qindex)) return false;
+
+ // Use residual statistics to decide if the SPLIT partition should be
+ // evaluated for 32x32 blocks. The pruning logic is avoided for larger block
+ // sizes to avoid visual artifacts.
+ if (pc_tree->none->mic.mode == NEWMV && bsize == BLOCK_32X32 && do_split) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ assert(subsize < BLOCK_SIZES_ALL);
+ double min_per_pixel_error = DBL_MAX;
+ double max_per_pixel_error = 0.;
+ int i;
+ for (i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ const int x_idx = (i & 1) * hbs;
+ const int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols)) {
+ break;
+ }
+
+ // Populate the appropriate buffer pointers.
+ // Pass scale factors as NULL as the base pointer of the block would have
+ // been calculated appropriately.
+ struct buf_2d src_split_buf_2d, pred_split_buf_2d;
+ const struct buf_2d *src_none_buf_2d = &x->plane[AOM_PLANE_Y].src;
+ setup_pred_plane(&src_split_buf_2d, subsize, src_none_buf_2d->buf,
+ src_none_buf_2d->width, src_none_buf_2d->height,
+ src_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0);
+ const struct buf_2d *pred_none_buf_2d = &xd->plane[AOM_PLANE_Y].dst;
+ setup_pred_plane(&pred_split_buf_2d, subsize, pred_none_buf_2d->buf,
+ pred_none_buf_2d->width, pred_none_buf_2d->height,
+ pred_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0);
+
+ unsigned int curr_uint_mse;
+ const unsigned int curr_uint_var = cpi->ppi->fn_ptr[subsize].vf(
+ src_split_buf_2d.buf, src_split_buf_2d.stride, pred_split_buf_2d.buf,
+ pred_split_buf_2d.stride, &curr_uint_mse);
+ const double curr_per_pixel_error =
+ sqrt((double)curr_uint_var / block_size_wide[subsize] /
+ block_size_high[subsize]);
+ if (curr_per_pixel_error < min_per_pixel_error)
+ min_per_pixel_error = curr_per_pixel_error;
+ if (curr_per_pixel_error > max_per_pixel_error)
+ max_per_pixel_error = curr_per_pixel_error;
+ }
+
+ // Prune based on residual statistics only if all the sub-partitions are
+ // valid.
+ if (i == SUB_PARTITIONS_SPLIT) {
+ if (max_per_pixel_error - min_per_pixel_error <= 1.5) do_split = false;
+ }
+ }
+
+ return do_split;
+}
+
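// Sketch of the residual-spread prune in calc_do_split_flag() above: when the
// four sub-blocks show nearly identical per-pixel prediction error, splitting
// is unlikely to help, so PARTITION_SPLIT evaluation is dropped. The variance
// inputs are assumed to come from the caller; this helper is hypothetical and
// not part of libaom.
#include <math.h>

static int prune_split_by_error_spread(const unsigned int var[4], int sub_w,
                                       int sub_h) {
  double min_err = 1e300, max_err = 0.0;
  for (int i = 0; i < 4; ++i) {
    const double err = sqrt((double)var[i] / sub_w / sub_h);
    if (err < min_err) min_err = err;
    if (err > max_err) max_err = err;
  }
  return (max_err - min_err) <= 1.5;  // prune when the spread is small
}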
+static void try_merge(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize, PC_TREE *const pc_tree,
+ const PARTITION_TYPE partition, const BLOCK_SIZE subsize,
+ const int pl) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ // Only square blocks from 8x8 to 128x128 are supported
+ assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ bool do_split = false;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS split_rdc, none_rdc;
+ av1_invalid_rd_stats(&split_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ pc_tree->partitioning = PARTITION_NONE;
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+ pc_tree->none);
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+ none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) {
+ do_split = calc_do_split_flag(cpi, x, pc_tree, &none_rdc, mi_params, mi_row,
+ mi_col, hbs, bsize, partition);
+ if (do_split) {
+ av1_init_rd_stats(&split_rdc);
+ split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ RD_STATS block_rdc;
+ av1_invalid_rd_stats(&block_rdc);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ } else {
+ av1_reset_pmc(pc_tree->split[i]->none);
+ }
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &block_rdc, subsize, pc_tree->split[i]->none);
+ // TODO(yunqingwang): The rate here does not include the cost of
+ // signaling the PARTITION_NONE token in the sub-blocks.
+ split_rdc.rate += block_rdc.rate;
+ split_rdc.dist += block_rdc.dist;
+
+ av1_rd_cost_update(x->rdmult, &split_rdc);
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {
+ break;
+ }
+
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
+ 1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
+ NULL);
+ }
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+ }
+ }
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {
+ /* Predicted samples cannot be reused for PARTITION_NONE since the same
+ * buffer is used to store the reconstructed samples of the
+ * PARTITION_SPLIT block. */
+ if (do_split) x->reuse_inter_pred = false;
+
+ mib[0]->bsize = bsize;
+ pc_tree->partitioning = PARTITION_NONE;
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+ pc_tree->none, NULL);
+ } else {
+ mib[0]->bsize = subsize;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ /* Predicted samples cannot be reused for PARTITION_SPLIT since the same
+ * buffer is used to write the reconstructed samples. */
+ // TODO(Cherma): Store and reuse predicted samples generated by
+ // encode_b_nonrd() in DRY_RUN_NORMAL mode.
+ x->reuse_inter_pred = false;
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ // Note: We don't reset pc_tree->split[i]->none here because it
+ // could contain results from the additional check. Instead, it is
+ // reset before we enter the nonrd_check_partition_merge_mode
+ // condition.
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ }
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+ }
+ }
+}
+
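// The SPLIT loops above derive each sub-block's mi offsets from the loop
// index; written out as a stand-alone helper for clarity (hbs is half the
// block width in mi units). Hypothetical helper, not part of libaom.
static void split_subblock_offset(int i, int hbs, int *x_idx, int *y_idx) {
  *x_idx = (i & 1) * hbs;   // 0,1,0,1 -> left / right column
  *y_idx = (i >> 1) * hbs;  // 0,0,1,1 -> top / bottom row
}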
// Evaluate if the sub-partitions can be merged directly into a large partition
// without calculating the RD cost.
static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
@@ -2495,7 +2870,7 @@ MI_SIZE
* \param[in] pc_tree Pointer to the PC_TREE node holding the picked
partitions and mode info for the current block
*
-* \return Nothing is returned. The pc_tree struct is modified to store the
+* \remark Nothing is returned. The pc_tree struct is modified to store the
* picked partition and modes.
*/
void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
@@ -2512,9 +2887,9 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
const int bs = mi_size_wide[bsize];
const int hbs = bs / 2;
- const PARTITION_TYPE partition =
- (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
- : PARTITION_NONE;
+ PARTITION_TYPE partition = (bsize >= BLOCK_8X8)
+ ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
assert(subsize <= BLOCK_LARGEST);
const int pl = (bsize >= BLOCK_8X8)
@@ -2528,8 +2903,6 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
assert(mi_size_wide[bsize] == mi_size_high[bsize]);
- pc_tree->partitioning = partition;
-
xd->above_txfm_context =
cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
xd->left_txfm_context =
@@ -2538,6 +2911,23 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
// Initialize default mode evaluation params
set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+ x->reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd;
+
+ int change_none_to_split = 0;
+ if (partition == PARTITION_NONE &&
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+ change_none_to_split =
+ try_split_partition(cpi, td, tile_data, tile_info, tp, x, xd, mi_params,
+ mi_row, mi_col, bsize, pl, pc_tree);
+ if (change_none_to_split) {
+ partition = PARTITION_SPLIT;
+ subsize = get_partition_subsize(bsize, partition);
+ assert(subsize <= BLOCK_LARGEST);
+ }
+ }
+
+ pc_tree->partitioning = partition;
+
switch (partition) {
case PARTITION_NONE:
if (!pc_tree->none) {
@@ -2545,73 +2935,10 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
} else {
av1_reset_pmc(pc_tree->none);
}
- if (cpi->sf.rt_sf.nonrd_check_partition_split && do_split_check(bsize) &&
- !frame_is_intra_only(cm)) {
- RD_STATS split_rdc, none_rdc, block_rdc;
- RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-
- av1_init_rd_stats(&split_rdc);
- av1_invalid_rd_stats(&none_rdc);
-
- av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
- subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
- pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
- pc_tree->none);
- none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
- none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
- av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-
- for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
- av1_invalid_rd_stats(&block_rdc);
- const int x_idx = (i & 1) * hbs;
- const int y_idx = (i >> 1) * hbs;
- if (mi_row + y_idx >= mi_params->mi_rows ||
- mi_col + x_idx >= mi_params->mi_cols)
- continue;
- xd->above_txfm_context =
- cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
- xd->left_txfm_context =
- xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
- pc_tree->split[i]->partitioning = PARTITION_NONE;
- pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
- &block_rdc, subsize, pc_tree->split[i]->none);
- split_rdc.rate += block_rdc.rate;
- split_rdc.dist += block_rdc.dist;
-
- encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
- 1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
- NULL);
- }
- split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
- split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
- av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-
- if (none_rdc.rdcost < split_rdc.rdcost) {
- mib[0]->bsize = bsize;
- pc_tree->partitioning = PARTITION_NONE;
- encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
- partition, pc_tree->none, NULL);
- } else {
- mib[0]->bsize = subsize;
- pc_tree->partitioning = PARTITION_SPLIT;
- for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
- const int x_idx = (i & 1) * hbs;
- const int y_idx = (i >> 1) * hbs;
- if (mi_row + y_idx >= mi_params->mi_rows ||
- mi_col + x_idx >= mi_params->mi_cols)
- continue;
- encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
- mi_col + x_idx, 0, subsize, PARTITION_NONE,
- pc_tree->split[i]->none, NULL);
- }
- }
-
- } else {
- pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
- bsize, pc_tree->none);
- encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
- partition, pc_tree->none, NULL);
- }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, bsize,
+ pc_tree->none);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+ partition, pc_tree->none, NULL);
break;
case PARTITION_VERT:
for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
@@ -2664,107 +2991,8 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
!frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
- RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
- RD_STATS split_rdc, none_rdc;
- av1_invalid_rd_stats(&split_rdc);
- av1_invalid_rd_stats(&none_rdc);
- av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
- xd->above_txfm_context =
- cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
- xd->left_txfm_context =
- xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
- pc_tree->partitioning = PARTITION_NONE;
- if (!pc_tree->none) {
- pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
- } else {
- av1_reset_pmc(pc_tree->none);
- }
- pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
- pc_tree->none);
- none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
- none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
- av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
- if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
- none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) {
- const int is_larger_qindex = cm->quant_params.base_qindex > 100;
- const int do_split =
- (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3)
- ? (bsize <= BLOCK_32X32 ||
- (is_larger_qindex && bsize <= BLOCK_64X64))
- : 1;
- if (do_split) {
- av1_init_rd_stats(&split_rdc);
- split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
- for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
- RD_STATS block_rdc;
- av1_invalid_rd_stats(&block_rdc);
- int x_idx = (i & 1) * hbs;
- int y_idx = (i >> 1) * hbs;
- if ((mi_row + y_idx >= mi_params->mi_rows) ||
- (mi_col + x_idx >= mi_params->mi_cols))
- continue;
- xd->above_txfm_context =
- cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
- xd->left_txfm_context = xd->left_txfm_context_buffer +
- ((mi_row + y_idx) & MAX_MIB_MASK);
- if (!pc_tree->split[i]->none) {
- pc_tree->split[i]->none =
- av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
- } else {
- av1_reset_pmc(pc_tree->split[i]->none);
- }
- pc_tree->split[i]->partitioning = PARTITION_NONE;
- pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx,
- mi_col + x_idx, &block_rdc, subsize,
- pc_tree->split[i]->none);
- // TODO(yunqingwang): The rate here did not include the cost of
- // signaling PARTITION_NONE token in the sub-blocks.
- split_rdc.rate += block_rdc.rate;
- split_rdc.dist += block_rdc.dist;
-
- av1_rd_cost_update(x->rdmult, &split_rdc);
-
- if (none_rdc.rdcost < split_rdc.rdcost) {
- break;
- }
-
- encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
- mi_col + x_idx, 1, subsize, PARTITION_NONE,
- pc_tree->split[i]->none, NULL);
- }
- av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
- split_rdc.rdcost =
- RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
- }
- }
- if (none_rdc.rdcost < split_rdc.rdcost) {
- mib[0]->bsize = bsize;
- pc_tree->partitioning = PARTITION_NONE;
- encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
- partition, pc_tree->none, NULL);
- } else {
- mib[0]->bsize = subsize;
- pc_tree->partitioning = PARTITION_SPLIT;
- for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
- int x_idx = (i & 1) * hbs;
- int y_idx = (i >> 1) * hbs;
- if ((mi_row + y_idx >= mi_params->mi_rows) ||
- (mi_col + x_idx >= mi_params->mi_cols))
- continue;
-
- // Note: We don't reset pc_tree->split[i]->none here because it
- // could contain results from the additional check. Instead, it is
- // reset before we enter the nonrd_check_partition_merge_mode
- // condition.
- if (!pc_tree->split[i]->none) {
- pc_tree->split[i]->none =
- av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
- }
- encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
- mi_col + x_idx, 0, subsize, PARTITION_NONE,
- pc_tree->split[i]->none, NULL);
- }
- }
+ try_merge(cpi, td, tile_data, mib, tp, mi_row, mi_col, bsize, pc_tree,
+ partition, subsize, pl);
} else {
for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
int x_idx = (i & 1) * hbs;
@@ -2779,15 +3007,17 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
}
- // Note: Palette, cfl are not supported.
- if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf &&
- cpi->sf.rt_sf.partition_direct_merging &&
- mode_costs->partition_cost[pl][PARTITION_NONE] <
- mode_costs->partition_cost[pl][PARTITION_SPLIT] &&
- (mi_row + bs <= mi_params->mi_rows) &&
- (mi_col + bs <= mi_params->mi_cols)) {
- direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col,
- bsize);
+ if (!change_none_to_split) {
+ // Note: Palette, cfl are not supported.
+ if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf &&
+ cpi->sf.rt_sf.partition_direct_merging &&
+ mode_costs->partition_cost[pl][PARTITION_NONE] <
+ mode_costs->partition_cost[pl][PARTITION_SPLIT] &&
+ (mi_row + bs <= mi_params->mi_rows) &&
+ (mi_col + bs <= mi_params->mi_cols)) {
+ direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col,
+ bsize);
+ }
}
}
break;
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index f66fd08ba..89c1a7956 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -1767,11 +1767,10 @@ int evaluate_ab_partition_based_on_split(
// Threshold for number of winners
// Conservative pruning for high quantizers
const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
- int sub_part_win = (rect_part_win_info == NULL)
- ? (pc_tree->partitioning == rect_part)
- : (rect_part == PARTITION_HORZ)
- ? rect_part_win_info->rect_part_win[HORZ]
- : rect_part_win_info->rect_part_win[VERT];
+ int sub_part_win =
+ (rect_part_win_info == NULL) ? (pc_tree->partitioning == rect_part)
+ : (rect_part == PARTITION_HORZ) ? rect_part_win_info->rect_part_win[HORZ]
+ : rect_part_win_info->rect_part_win[VERT];
num_win += (sub_part_win) ? 1 : 0;
if (pc_tree->split[split_idx1]) {
num_win +=
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 0dd09398c..1dd9721c2 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -173,7 +173,7 @@ static double calc_correction_factor(double err_per_mb, int q) {
// Based on history adjust expectations of bits per macroblock.
static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
- TWO_PASS *twopass = &cpi->ppi->twopass;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
// Based on recent history adjust expectations of bits per macroblock.
@@ -212,7 +212,7 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
}
int err_estimate = p_rc->rate_error_estimate;
- int64_t bits_left = cpi->ppi->twopass.bits_left;
+ int64_t bits_left = twopass->bits_left;
int64_t total_actual_bits = p_rc->total_actual_bits;
int64_t bits_off_target = p_rc->vbr_bits_off_target;
double rolling_arf_group_actual_bits =
@@ -231,8 +231,8 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
: p_rc->total_actual_bits;
bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target
: p_rc->vbr_bits_off_target;
- bits_left = simulate_parallel_frame ? p_rc->temp_bits_left
- : cpi->ppi->twopass.bits_left;
+ bits_left =
+ simulate_parallel_frame ? p_rc->temp_bits_left : twopass->bits_left;
rolling_arf_group_target_bits =
(double)(simulate_parallel_frame
? p_rc->temp_rolling_arf_group_target_bits
@@ -799,11 +799,10 @@ static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
}
// Clamp odd edge cases.
- total_group_bits = (total_group_bits < 0)
- ? 0
- : (total_group_bits > twopass->kf_group_bits)
- ? twopass->kf_group_bits
- : total_group_bits;
+ total_group_bits = (total_group_bits < 0) ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
// Clip based on user supplied data rate variability limit.
if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval)
@@ -1876,7 +1875,7 @@ static int find_regions_index(const REGIONS *regions, int num_regions,
* \param[in] max_gop_length Maximum length of the GF group
* \param[in] max_intervals Maximum number of intervals to decide
*
- * \return Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is
+ * \remark Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is
* changed to store the decided GF group lengths.
*/
static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
@@ -2143,7 +2142,7 @@ static void correct_frames_to_key(AV1_COMP *cpi) {
*
* \param[in] cpi Top-level encoder structure
*
- * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
*/
static void define_gf_group_pass0(AV1_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -2365,7 +2364,7 @@ static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only,
#if GROUP_ADAPTIVE_MAXQ
// Calculate an estimate of the maxq needed for the group.
- // We are more agressive about correcting for sections
+ // We are more aggressive about correcting for sections
// where there could be significant overshoot than for easier
// sections where we do not wish to risk creating an overshoot
// of the allocated bit budget.
@@ -2432,7 +2431,7 @@ static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only,
* \param[in] is_final_pass Whether this is the final pass for the
* GF group, or a trial (non-zero)
*
- * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
*/
static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params,
int is_final_pass) {
@@ -3147,8 +3146,6 @@ static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err,
*
* \param[in] cpi Top-level encoder structure
* \param[in] this_frame Pointer to first pass stats
- *
- * \return Nothing is returned.
*/
static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -3222,8 +3219,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->frames_to_key = kf_cfg->key_freq_max;
}
- rc->frames_to_fwd_kf = kf_cfg->fwd_kf_dist;
-
if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi);
// If there is a max kf interval set by the user we must obey it.
@@ -3440,13 +3435,13 @@ static void process_first_pass_stats(AV1_COMP *cpi,
if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 &&
cpi->gf_frame_index == 0 && total_stats &&
- cpi->ppi->twopass.stats_buf_ctx->total_left_stats) {
+ twopass->stats_buf_ctx->total_left_stats) {
if (cpi->ppi->lap_enabled) {
/*
* Accumulate total_stats using available limited number of stats,
* and assign it to total_left_stats.
*/
- *cpi->ppi->twopass.stats_buf_ctx->total_left_stats = *total_stats;
+ *twopass->stats_buf_ctx->total_left_stats = *total_stats;
}
// Special case code for first frame.
const int section_target_bandwidth = get_section_target_bandwidth(cpi);
@@ -3473,8 +3468,7 @@ static void process_first_pass_stats(AV1_COMP *cpi,
p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME];
}
- if (cpi->twopass_frame.stats_in <
- cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+ if (cpi->twopass_frame.stats_in < twopass->stats_buf_ctx->stats_in_end) {
*this_frame = *cpi->twopass_frame.stats_in;
++cpi->twopass_frame.stats_in;
}
@@ -3495,8 +3489,8 @@ static void setup_target_rate(AV1_COMP *cpi) {
rc->base_frame_target = target_rate;
}
-static void mark_flashes(FIRSTPASS_STATS *first_stats,
- FIRSTPASS_STATS *last_stats) {
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
FIRSTPASS_STATS *this_stats = first_stats, *next_stats;
while (this_stats < last_stats - 1) {
next_stats = this_stats + 1;
@@ -3515,8 +3509,8 @@ static void mark_flashes(FIRSTPASS_STATS *first_stats,
}
// Estimate the noise variance of each frame from the first pass stats
-static void estimate_noise(FIRSTPASS_STATS *first_stats,
- FIRSTPASS_STATS *last_stats) {
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
FIRSTPASS_STATS *this_stats, *next_stats;
double C1, C2, C3, noise;
for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
@@ -3604,8 +3598,8 @@ static void estimate_noise(FIRSTPASS_STATS *first_stats,
}
// Estimate correlation coefficient of each frame with its previous frame.
-static void estimate_coeff(FIRSTPASS_STATS *first_stats,
- FIRSTPASS_STATS *last_stats) {
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
FIRSTPASS_STATS *this_stats;
for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) {
const double C =
@@ -3650,6 +3644,16 @@ void av1_get_second_pass_params(AV1_COMP *cpi,
if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return;
+ // Check forced key frames.
+ const int frames_to_next_forced_key = detect_app_forced_key(cpi);
+ if (frames_to_next_forced_key == 0) {
+ rc->frames_to_key = 0;
+ frame_flags &= FRAMEFLAGS_KEY;
+ } else if (frames_to_next_forced_key > 0 &&
+ frames_to_next_forced_key < rc->frames_to_key) {
+ rc->frames_to_key = frames_to_next_forced_key;
+ }
+
assert(cpi->twopass_frame.stats_in != NULL);
const int update_type = gf_group->update_type[cpi->gf_frame_index];
frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
@@ -3721,9 +3725,16 @@ void av1_get_second_pass_params(AV1_COMP *cpi,
oxcf->algo_cfg.arnr_max_frames / 2)
: MAX_GF_LENGTH_LAP;
+ // Handle forward key frame when enabled.
+ if (oxcf->kf_cfg.fwd_kf_dist > 0)
+ max_gop_length = AOMMIN(rc->frames_to_fwd_kf + 1, max_gop_length);
+
// Use the provided gop size in low delay setting
if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval;
+ // Limit the max gop length for the last gop in 1 pass setting.
+ max_gop_length = AOMMIN(max_gop_length, rc->frames_to_key);
+
// Identify regions if needed.
// TODO(bohanli): identify regions for all stats available.
if (rc->frames_since_key == 0 || rc->frames_since_key == 1 ||
@@ -3741,12 +3752,12 @@ void av1_get_second_pass_params(AV1_COMP *cpi,
p_rc->frames_till_regions_update = rest_frames;
if (cpi->ppi->lap_enabled) {
- mark_flashes(twopass->stats_buf_ctx->stats_in_start,
- twopass->stats_buf_ctx->stats_in_end);
- estimate_noise(twopass->stats_buf_ctx->stats_in_start,
- twopass->stats_buf_ctx->stats_in_end);
- estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
- twopass->stats_buf_ctx->stats_in_end);
+ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
av1_identify_regions(cpi->twopass_frame.stats_in, rest_frames,
(rc->frames_since_key == 0), p_rc->regions,
&p_rc->num_regions);
@@ -3911,12 +3922,12 @@ void av1_init_second_pass(AV1_COMP *cpi) {
if (!twopass->stats_buf_ctx->stats_in_end) return;
- mark_flashes(twopass->stats_buf_ctx->stats_in_start,
- twopass->stats_buf_ctx->stats_in_end);
- estimate_noise(twopass->stats_buf_ctx->stats_in_start,
- twopass->stats_buf_ctx->stats_in_end);
- estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
- twopass->stats_buf_ctx->stats_in_end);
+ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
stats = twopass->stats_buf_ctx->total_stats;
@@ -3934,7 +3945,7 @@ void av1_init_second_pass(AV1_COMP *cpi) {
(int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0);
#if CONFIG_BITRATE_ACCURACY
- av1_vbr_rc_init(&cpi->vbr_rc_info, cpi->ppi->twopass.bits_left,
+ av1_vbr_rc_init(&cpi->vbr_rc_info, twopass->bits_left,
(int)round(stats->count));
#endif
@@ -4031,6 +4042,8 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) {
const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) {
FIRSTPASS_STATS this_frame;
+ assert(cpi->twopass_frame.stats_in >
+ twopass->stats_buf_ctx->stats_in_start);
--cpi->twopass_frame.stats_in;
if (cpi->ppi->lap_enabled) {
input_stats_lap(twopass, &cpi->twopass_frame, &this_frame);
@@ -4038,8 +4051,7 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) {
input_stats(twopass, &cpi->twopass_frame, &this_frame);
}
} else if (cpi->ppi->lap_enabled) {
- cpi->twopass_frame.stats_in =
- cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+ cpi->twopass_frame.stats_in = twopass->stats_buf_ctx->stats_in_start;
}
}
diff --git a/av1/encoder/pass2_strategy.h b/av1/encoder/pass2_strategy.h
index 6234623a5..a75be1afa 100644
--- a/av1/encoder/pass2_strategy.h
+++ b/av1/encoder/pass2_strategy.h
@@ -21,7 +21,6 @@ struct EncodeFrameParams;
#include "av1/encoder/encoder.h"
-/*!\endcond */
/*!
* \brief accumulated stats and features in a gf group
*/
@@ -60,7 +59,7 @@ typedef struct {
double frame_sr_coded_error;
/*!\endcond */
} GF_FRAME_STATS;
-/*!cond */
+/*!\cond */
void av1_init_second_pass(struct AV1_COMP *cpi);
@@ -83,7 +82,7 @@ void av1_init_single_pass_lap(AV1_COMP *cpi);
* \param[in] frame_params Per frame encoding parameters
* \param[in] frame_flags Frame type and coding flags
*
- * \return No return but analyses first pass stats and assigns a target
+ * \remark No return but analyses first pass stats and assigns a target
* number of bits to the current frame and a target Q range.
*/
void av1_get_second_pass_params(struct AV1_COMP *cpi,
@@ -99,7 +98,7 @@ void av1_get_second_pass_params(struct AV1_COMP *cpi,
*
* \param[in] cpi Top - level encoder instance structure
*
- * \return No return value but this function updates various rate control
+ * \remark No return value but this function updates various rate control
* related data structures that for example track overshoot and
* undershoot.
*/
@@ -121,7 +120,7 @@ void av1_twopass_postencode_update(struct AV1_COMP *cpi);
* uni-directional group.
* \param[in] gf_group_bits Bits available to be allocated.
*
- * \return No return but updates the rate control and group data structures
+ * \remark No return but updates the rate control and group data structures
* to reflect the allocation of bits.
*/
void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
@@ -149,6 +148,13 @@ void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
int total_frames, int offset, REGIONS *regions,
int *total_regions);
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index adbb07c87..22a455759 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -22,6 +22,7 @@
#include "av1/encoder/encoder.h"
#include "av1/encoder/ethread.h"
#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/mcomp.h"
// Get primary and secondary filter strength for the given strength index and
// search method
@@ -41,17 +42,29 @@ static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method,
if (pick_method == CDEF_FULL_SEARCH) return;
switch (pick_method) {
- case CDEF_FAST_SEARCH_LVL1: *pri_strength = priconv_lvl1[pri_idx]; break;
- case CDEF_FAST_SEARCH_LVL2: *pri_strength = priconv_lvl2[pri_idx]; break;
+ case CDEF_FAST_SEARCH_LVL1:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL1);
+ *pri_strength = priconv_lvl1[pri_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL2:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ *pri_strength = priconv_lvl2[pri_idx];
+ break;
case CDEF_FAST_SEARCH_LVL3:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
*pri_strength = priconv_lvl2[pri_idx];
*sec_strength = secconv_lvl3[sec_idx];
break;
case CDEF_FAST_SEARCH_LVL4:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
*pri_strength = priconv_lvl4[pri_idx];
*sec_strength = secconv_lvl3[sec_idx];
break;
case CDEF_FAST_SEARCH_LVL5:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL5);
*pri_strength = priconv_lvl5[pri_idx];
*sec_strength = secconv_lvl5[sec_idx];
break;
@@ -210,29 +223,6 @@ static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
return best_tot_mse;
}
-#if CONFIG_AV1_HIGHBITDEPTH
-static void copy_sb16_16_highbd(uint16_t *dst, int dstride, const void *src,
- int src_voffset, int src_hoffset, int sstride,
- int vsize, int hsize) {
- int r;
- const uint16_t *src16 = CONVERT_TO_SHORTPTR((uint8_t *)src);
- const uint16_t *base = &src16[src_voffset * sstride + src_hoffset];
- for (r = 0; r < vsize; r++)
- memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
-}
-#endif
-
-static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
- int src_voffset, int src_hoffset, int sstride,
- int vsize, int hsize) {
- int r, c;
- const uint8_t *src8 = (uint8_t *)src;
- const uint8_t *base = &src8[src_voffset * sstride + src_hoffset];
- for (r = 0; r < vsize; r++)
- for (c = 0; c < hsize; c++)
- dst[r * dstride + c] = (uint16_t)base[r * sstride + c];
-}
-
static INLINE void init_src_params(int *src_stride, int *width, int *height,
int *width_log2, int *height_log2,
BLOCK_SIZE bsize) {
@@ -267,6 +257,22 @@ static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
return sum >> 2 * coeff_shift;
}
#endif
+
+// Checks whether dual and quad block processing is applicable for block widths
+// 8 and 4 respectively.
+static INLINE int is_dual_or_quad_applicable(cdef_list *dlist, int width,
+ int cdef_count, int bi, int iter) {
+ assert(width == 8 || width == 4);
+ const int blk_offset = (width == 8) ? 1 : 3;
+ if ((iter + blk_offset) >= cdef_count) return 0;
+
+ if (dlist[bi].by == dlist[bi + blk_offset].by &&
+ dlist[bi].bx + blk_offset == dlist[bi + blk_offset].bx)
+ return 1;
+
+ return 0;
+}
+
static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
cdef_list *dlist, int cdef_count,
BLOCK_SIZE bsize, int coeff_shift, int row,
@@ -275,21 +281,232 @@ static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
bsize == BLOCK_8X8);
uint64_t sum = 0;
int bi, bx, by;
+ int iter = 0;
+ int inc = 1;
uint8_t *dst8 = (uint8_t *)dst;
uint8_t *dst_buff = &dst8[row * dstride + col];
int src_stride, width, height, width_log2, height_log2;
init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
bsize);
- for (bi = 0; bi < cdef_count; bi++) {
+
+ const int num_blks = 16 / width;
+ for (bi = 0; bi < cdef_count; bi += inc) {
by = dlist[bi].by;
bx = dlist[bi].bx;
- sum += aom_mse_wxh_16bit(
- &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
- &src[bi << (height_log2 + width_log2)], src_stride, width, height);
+ uint16_t *src_tmp = &src[bi << (height_log2 + width_log2)];
+ uint8_t *dst_tmp =
+ &dst_buff[(by << height_log2) * dstride + (bx << width_log2)];
+
+ if (is_dual_or_quad_applicable(dlist, width, cdef_count, bi, iter)) {
+ sum += aom_mse_16xh_16bit(dst_tmp, dstride, src_tmp, width, height);
+ iter += num_blks;
+ inc = num_blks;
+ } else {
+ sum += aom_mse_wxh_16bit(dst_tmp, dstride, src_tmp, src_stride, width,
+ height);
+ iter += 1;
+ inc = 1;
+ }
}
+
return sum >> 2 * coeff_shift;
}
+// Fill the boundary regions of the block with CDEF_VERY_LARGE, but only if the
+// region lies outside the frame boundary
+static INLINE void fill_borders_for_fbs_on_frame_boundary(
+ uint16_t *inbuf, int hfilt_size, int vfilt_size,
+ bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary,
+ bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) {
+ if (!is_fb_on_frm_left_boundary && !is_fb_on_frm_right_boundary &&
+ !is_fb_on_frm_top_boundary && !is_fb_on_frm_bottom_boundary)
+ return;
+ if (is_fb_on_frm_bottom_boundary) {
+ // Fill bottom region of the block
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + CDEF_HBORDER;
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_left_boundary) {
+ const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE;
+ // Fill bottom-left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + hfilt_size + CDEF_HBORDER;
+ // Fill bottom-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary) {
+ // Fill top region of the block
+ fill_rect(&inbuf[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_left_boundary) {
+ // Fill top-left region of the block
+ fill_rect(inbuf, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset = hfilt_size + CDEF_HBORDER;
+ // Fill top-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_left_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_right_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill right region of the block
+ fill_rect(&inbuf[buf_offset + hfilt_size + CDEF_HBORDER], CDEF_BSTRIDE,
+ vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+}
+
+// Calculate the number of 8x8/4x4 filter units for which SSE can be calculated
+// after CDEF filtering in a single function call
+static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units(
+ cdef_list *dlist, int cdef_count, int bi, int subsampling_x,
+ int subsampling_y) {
+ // TODO(Ranjit): Extend the optimization for 422
+ if (subsampling_x != subsampling_y) return 1;
+
+ // Combining more blocks seems to increase encode time due to an increase in
+ // control code
+ if (bi + 3 < cdef_count && dlist[bi].by == dlist[bi + 3].by &&
+ dlist[bi].bx + 3 == dlist[bi + 3].bx) {
+ /* Calculate error for four 8x8/4x4 blocks using 32x8/16x4 block specific
+ * logic if y co-ordinates match and x co-ordinates are
+ * separated by 3 for first and fourth 8x8/4x4 blocks in dlist[]. */
+ return 4;
+ }
+ if (bi + 1 < cdef_count && dlist[bi].by == dlist[bi + 1].by &&
+ dlist[bi].bx + 1 == dlist[bi + 1].bx) {
+ /* Calculate error for two 8x8/4x4 blocks using 16x8/8x4 block specific
+ * logic if their y co-ordinates match and x co-ordinates are
+ * separated by 1 for first and second 8x8/4x4 blocks in dlist[]. */
+ return 2;
+ }
+ return 1;
+}
+
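Illustrative note, not part of the patch: the helper above only inspects the dlist[] coordinates, so a minimal sketch of how it widens the error calculation (assuming 4:2:0 content, i.e. subsampling_x == subsampling_y) is:
/* With 8x8 units listed as {by=0,bx=0}, {0,1}, {0,2}, {0,3}, ...,
 * bi = 0 returns 4: dlist[bi + 3] sits on the same row three columns to
 * the right, so one aom_sse() call can cover a 32x8 strip. If only
 * {0,0} and {0,1} were contiguous the helper would return 2 (a 16x8
 * strip), and an isolated unit falls back to the single 8x8 path. */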
+// Returns the block error after CDEF filtering for a given strength
+static INLINE uint64_t get_filt_error(
+ const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd,
+ cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer,
+ int ref_stride, int row, int col, int pri_strength, int sec_strength,
+ int cdef_count, int pli, int coeff_shift, BLOCK_SIZE bs) {
+ uint64_t curr_sse = 0;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bs, pd->subsampling_x, pd->subsampling_y);
+ const int bw_log2 = 3 - pd->subsampling_x;
+ const int bh_log2 = 3 - pd->subsampling_y;
+
+ // TODO(Ranjit): Extend this optimization for HBD
+ if (!cdef_search_ctx->use_highbitdepth) {
+ // If all 8x8/4x4 blocks in the CDEF block need to be filtered, calculate the
+ // error at the CDEF block level
+ const int tot_blk_count =
+ (block_size_wide[plane_bsize] * block_size_high[plane_bsize]) >>
+ (bw_log2 + bh_log2);
+ if (cdef_count == tot_blk_count) {
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row, col };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ // When CDEF strength is zero, filtering is not applied. Hence the
+ // error is calculated between the source and unfiltered pixels
+ // error is calculated between source and unfiltered pixels
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ block_size_wide[plane_bsize], block_size_high[plane_bsize]);
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride, tmp_dst8,
+ (1 << MAX_SB_SIZE_LOG2), block_size_wide[plane_bsize],
+ block_size_high[plane_bsize]);
+ }
+ } else {
+ // If only some 8x8/4x4 blocks in the CDEF block need to be filtered, the
+ // filtering functions produce 8-bit output and the error is calculated in
+ // the 8-bit domain
+ if (pri_strength == 0 && sec_strength == 0) {
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const FULLPEL_MV tmp_buf_pos = { by_pos, bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ const int tmp_buf_offset =
+ get_offset_from_fullmv(&tmp_buf_pos, (1 << MAX_SB_SIZE_LOG2));
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride, &tmp_dst8[tmp_buf_offset],
+ (1 << MAX_SB_SIZE_LOG2),
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ }
+ }
+ } else {
+ DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in,
+ cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli],
+ dir, dirinit, var, pli, dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse = cdef_search_ctx->compute_cdef_dist_fn(
+ ref_buffer, ref_stride, tmp_dst, dlist, cdef_count,
+ cdef_search_ctx->bsize[pli], coeff_shift, row, col);
+ }
+ return curr_sse;
+}
+
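Illustrative note, not part of the patch: a paraphrase of how get_filt_error() picks its error path, based only on the code above:
/* if (!use_highbitdepth)
 *   if (cdef_count == tot_blk_count) -> one aom_sse() over the full block
 *                                       (filtered into tmp_dst8 unless both
 *                                       strengths are zero)
 *   else                             -> per-unit aom_sse(), widened by
 *                                       get_error_calc_width_in_filt_units()
 * else                               -> compute_cdef_dist_fn() on the 16-bit
 *                                       tmp_dst buffer, as before            */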
// Calculates MSE at block level.
// Inputs:
// cdef_search_ctx: Pointer to the structure containing parameters related to
@@ -307,7 +524,6 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
const int *mi_high_l2 = cdef_search_ctx->mi_high_l2;
// Declare and initialize the temporary buffers.
- DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
@@ -345,39 +561,43 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
// 8x8 blocks which are not skip.
const int cdef_count = av1_cdef_compute_sb_list(
mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
-
- const int yoff = CDEF_VBORDER * (fbr != 0);
- const int xoff = CDEF_HBORDER * (fbc != 0);
+ const bool is_fb_on_frm_left_boundary = (fbc == 0);
+ const bool is_fb_on_frm_right_boundary =
+ (fbc + hb_step == cdef_search_ctx->nhfb);
+ const bool is_fb_on_frm_top_boundary = (fbr == 0);
+ const bool is_fb_on_frm_bottom_boundary =
+ (fbr + vb_step == cdef_search_ctx->nvfb);
+ const int yoff = CDEF_VBORDER * (!is_fb_on_frm_top_boundary);
+ const int xoff = CDEF_HBORDER * (!is_fb_on_frm_left_boundary);
int dirinit = 0;
for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) {
- for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
/* We avoid filtering the pixels for which some of the pixels to
average are outside the frame. We could change the filter instead,
but it would add special cases for any future vectorization. */
- const int ysize = (nvb << mi_high_l2[pli]) +
- CDEF_VBORDER * (fbr + vb_step < cdef_search_ctx->nvfb) +
- yoff;
- const int xsize = (nhb << mi_wide_l2[pli]) +
- CDEF_HBORDER * (fbc + hb_step < cdef_search_ctx->nhfb) +
- xoff;
+ const int hfilt_size = (nhb << mi_wide_l2[pli]);
+ const int vfilt_size = (nvb << mi_high_l2[pli]);
+ const int ysize =
+ vfilt_size + CDEF_VBORDER * (!is_fb_on_frm_bottom_boundary) + yoff;
+ const int xsize =
+ hfilt_size + CDEF_HBORDER * (!is_fb_on_frm_right_boundary) + xoff;
const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
struct macroblockd_plane pd = cdef_search_ctx->plane[pli];
cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
pd.dst.buf, row - yoff, col - xoff, pd.dst.stride,
ysize, xsize);
+ fill_borders_for_fbs_on_frame_boundary(
+ inbuf, hfilt_size, vfilt_size, is_fb_on_frm_left_boundary,
+ is_fb_on_frm_right_boundary, is_fb_on_frm_top_boundary,
+ is_fb_on_frm_bottom_boundary);
for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) {
int pri_strength, sec_strength;
get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength,
&sec_strength, gi);
- av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in,
- cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli],
- dir, &dirinit, var, pli, dlist, cdef_count,
- pri_strength, sec_strength + (sec_strength == 3),
- cdef_search_ctx->damping, coeff_shift);
- const uint64_t curr_mse = cdef_search_ctx->compute_cdef_dist_fn(
- ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count,
- cdef_search_ctx->bsize[pli], coeff_shift, row, col);
+ const uint64_t curr_mse = get_filt_error(
+ cdef_search_ctx, &pd, dlist, dir, &dirinit, var, in, ref_buffer[pli],
+ ref_stride[pli], row, col, pri_strength, sec_strength, cdef_count,
+ pli, coeff_shift, bs);
if (pli < 2)
cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse;
else
@@ -476,6 +696,7 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame,
cdef_search_ctx->num_planes = num_planes;
cdef_search_ctx->pick_method = pick_method;
cdef_search_ctx->sb_count = 0;
+ cdef_search_ctx->use_highbitdepth = cm->seq_params->use_highbitdepth;
av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
num_planes);
// Initialize plane wise information.
@@ -495,20 +716,20 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame,
// Function pointer initialization.
#if CONFIG_AV1_HIGHBITDEPTH
if (cm->seq_params->use_highbitdepth) {
- cdef_search_ctx->copy_fn = copy_sb16_16_highbd;
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_highbd;
cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd;
} else {
- cdef_search_ctx->copy_fn = copy_sb16_16;
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
}
#else
- cdef_search_ctx->copy_fn = copy_sb16_16;
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
#endif
}
static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
- int frames_since_key, int is_screen_content) {
+ int is_screen_content) {
const int bd = cm->seq_params->bit_depth;
const int q =
av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8);
@@ -574,10 +795,14 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
cdef_info->cdef_uv_strengths[0] =
predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2;
+ // mbmi->cdef_strength is already set in the encoding stage. We don't need to
+ // set it again here.
if (skip_cdef) {
cdef_info->cdef_strengths[1] = 0;
cdef_info->cdef_uv_strengths[1] = 0;
+ return;
}
+
const CommonModeInfoParams *const mi_params = &cm->mi_params;
const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
@@ -586,10 +811,6 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
for (int c = 0; c < nhfb; ++c) {
MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c];
current_mbmi->cdef_strength = 0;
- if (skip_cdef && current_mbmi->skip_cdef_curr_sb &&
- frames_since_key > 10) {
- current_mbmi->cdef_strength = 1;
- }
}
mbmi += MI_SIZE_64X64 * mi_params->mi_stride;
}
@@ -598,9 +819,8 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
- int skip_cdef_feature, int frames_since_key,
- CDEF_CONTROL cdef_control, const int is_screen_content,
- int non_reference_frame) {
+ int skip_cdef_feature, CDEF_CONTROL cdef_control,
+ const int is_screen_content, int non_reference_frame) {
assert(cdef_control != CDEF_NONE);
if (cdef_control == CDEF_REFERENCE && non_reference_frame) {
CdefInfo *const cdef_info = &cm->cdef_info;
@@ -612,8 +832,7 @@ void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
}
if (pick_method == CDEF_PICK_FROM_Q) {
- pick_cdef_from_qp(cm, skip_cdef_feature, frames_since_key,
- is_screen_content);
+ pick_cdef_from_qp(cm, skip_cdef_feature, is_screen_content);
return;
}
const CommonModeInfoParams *const mi_params = &cm->mi_params;
@@ -648,7 +867,15 @@ void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
uint64_t(*mse[2])[TOTAL_STRENGTHS];
mse[0] = cdef_search_ctx.mse[0];
mse[1] = cdef_search_ctx.mse[1];
+ /* Calculate the maximum number of bits required to signal CDEF strengths at
+ * block level */
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ const int joint_strengths =
+ num_planes > 1 ? total_strengths * total_strengths : total_strengths;
+ const int max_signaling_bits =
+ joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1;
for (int i = 0; i <= 3; i++) {
+ if (i > max_signaling_bits) break;
int best_lev0[CDEF_MAX_STRENGTHS];
int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
const int nb_strengths = 1 << i;
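Illustrative note, not part of the patch: max_signaling_bits is simply the number of bits needed to index one joint strength pair, so the 0..3 loop is only clipped when the strength set is tiny. For example, with a single plane and total_strengths = 2, joint_strengths = 2 and get_msb(1) + 1 = 1, so only nb_strengths = 1 and 2 are tried; with two or more planes and total_strengths = 8, joint_strengths = 64 needs 6 bits and the loop runs unrestricted.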
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index e070a8a45..548a7401f 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h
@@ -65,7 +65,7 @@ static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
TOTAL_STRENGTHS
};
-typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const uint8_t *src,
int src_voffset, int src_hoffset, int sstride,
int vsize, int hsize);
typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
@@ -164,6 +164,11 @@ typedef struct {
* Holds the count of cdef filtered blocks
*/
int sb_count;
+ /*!
+ * Indicates whether 16-bit frame buffers are to be used, i.e., the content
+ * bit-depth is greater than 8
+ */
+ bool use_highbitdepth;
} CdefSearchCtx;
static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params,
@@ -226,13 +231,12 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
* \param[in] pick_method The method used to select params
* \param[in] rdmult rd multiplier to use in making param choices
* \param[in] skip_cdef_feature Speed feature to skip cdef
- * \param[in] frames_since_key Number of frames since key frame
* \param[in] cdef_control Parameter that controls CDEF application
* \param[in] is_screen_content Whether it is screen content type
* \param[in] non_reference_frame Indicates if current frame is
* non-reference
*
- * \return Nothing is returned. Instead, optimal CDEF parameters are stored
+ * \remark Nothing is returned. Instead, optimal CDEF parameters are stored
* in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
* \arg \c cdef_bits: Bits of strength parameters
* \arg \c nb_cdef_strengths: Number of strength parameters
@@ -247,9 +251,8 @@ void av1_cdef_search(struct MultiThreadInfo *mt_info,
const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
- int skip_cdef_feature, int frames_since_key,
- CDEF_CONTROL cdef_control, const int is_screen_content,
- int non_reference_frame);
+ int skip_cdef_feature, CDEF_CONTROL cdef_control,
+ const int is_screen_content, int non_reference_frame);
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index 3d3020aea..90c3c1ae8 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -202,7 +202,6 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
(void)sd;
lf->sharpness_level = 0;
- cpi->td.mb.rdmult = cpi->rd.RDMULT;
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
@@ -212,7 +211,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
if (disable_filter_rt_screen ||
cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE ||
(cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE &&
- cpi->svc.non_reference_frame)) {
+ cpi->ppi->rtc_ref.non_reference_frame)) {
lf->filter_level[0] = 0;
lf->filter_level[1] = 0;
return;
diff --git a/av1/encoder/picklpf.h b/av1/encoder/picklpf.h
index 727335517..f567937c3 100644
--- a/av1/encoder/picklpf.h
+++ b/av1/encoder/picklpf.h
@@ -43,7 +43,7 @@ int av1_get_max_filter_level(const AV1_COMP *cpi);
* \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last
* frame
*
- * \return Nothing is returned. Instead, filter levels below are stored in the
+ * \remark Nothing is returned. Instead, filter levels below are stored in the
* "loopfilter" structure inside "cpi":
* \arg \c filter_level[0]: the vertical filter level for Y plane
* \arg \c filter_level[1]: the horizontal filter level for Y plane
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 008c469be..dc599a8f7 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -148,7 +148,7 @@ typedef struct {
// tile in the frame.
SgrprojInfo sgrproj;
WienerInfo wiener;
- AV1PixelRect tile_rect;
+ PixelRect tile_rect;
} RestSearchCtxt;
static AOM_INLINE void rsc_on_tile(void *priv) {
@@ -191,7 +191,7 @@ static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
const RestorationTileLimits *limits,
- const AV1PixelRect *tile_rect,
+ const PixelRect *tile_rect,
const RestorationUnitInfo *rui) {
const AV1_COMMON *const cm = rsc->cm;
const int plane = rsc->plane;
@@ -873,9 +873,8 @@ static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
}
static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
- const AV1PixelRect *tile,
- int rest_unit_idx, void *priv,
- int32_t *tmpbuf,
+ const PixelRect *tile, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
(void)rlbs;
RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
@@ -1105,7 +1104,7 @@ static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
for (int j = 0; j < n; j++) {
A[(i + 1) * stride + j] -= c / 256 * A[k * stride + j] / cd * 256;
}
- b[i + 1] -= c * b[k] / cd;
+ b[i + 1] -= c / 256 * b[k] / cd * 256;
}
}
// Back-substitution
@@ -1389,7 +1388,7 @@ static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
#define USE_WIENER_REFINEMENT_SEARCH 1
static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
const RestorationTileLimits *limits,
- const AV1PixelRect *tile,
+ const PixelRect *tile,
RestorationUnitInfo *rui,
int wiener_win) {
const int plane_off = (WIENER_WIN - wiener_win) >> 1;
@@ -1495,7 +1494,7 @@ static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
}
static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
- const AV1PixelRect *tile_rect,
+ const PixelRect *tile_rect,
int rest_unit_idx, void *priv,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
@@ -1630,7 +1629,7 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
}
static AOM_INLINE void search_norestore(const RestorationTileLimits *limits,
- const AV1PixelRect *tile_rect,
+ const PixelRect *tile_rect,
int rest_unit_idx, void *priv,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
@@ -1649,7 +1648,7 @@ static AOM_INLINE void search_norestore(const RestorationTileLimits *limits,
}
static AOM_INLINE void search_switchable(const RestorationTileLimits *limits,
- const AV1PixelRect *tile_rect,
+ const PixelRect *tile_rect,
int rest_unit_idx, void *priv,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
index 46a4b48f2..94a6932de 100644
--- a/av1/encoder/pickrst.h
+++ b/av1/encoder/pickrst.h
@@ -65,7 +65,7 @@ static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start,
* \param[in] sd Source frame buffer
* \param[in,out] cpi Top-level encoder structure
*
- * \return Nothing is returned. Instead, chosen restoration filter
+ * \remark Nothing is returned. Instead, chosen restoration filter
* types and parameters are stored per plane in the \c rst_info structure
* of type \ref RestorationInfo inside \c cpi->common:
* \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index a7d309e9c..951848056 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -34,6 +34,8 @@
#include "av1/encoder/random.h"
#include "av1/encoder/ratectrl.h"
+#include "config/aom_dsp_rtcd.h"
+
#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0
// Max rate target for 1080P and below encodes under normal circumstances
@@ -159,15 +161,45 @@ double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
}
}
-int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor, aom_bit_depth_t bit_depth,
- const int is_screen_content_type) {
- const double q = av1_convert_qindex_to_q(qindex, bit_depth);
- int enumerator = frame_type == KEY_FRAME ? 2000000 : 1500000;
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+ const int is_screen_content_type) {
+ int enumerator;
+
if (is_screen_content_type) {
- enumerator = frame_type == KEY_FRAME ? 1000000 : 750000;
+ enumerator = (frame_type == KEY_FRAME) ? 1000000 : 750000;
+ } else {
+ enumerator = (frame_type == KEY_FRAME) ? 2000000 : 1500000;
}
+ return enumerator;
+}
+
+int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex,
+ double correction_factor, int accurate_estimate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_screen_content_type = cpi->is_screen_content_type;
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+ const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+
+ const int min_dim = AOMMIN(cm->width, cm->height);
+
+ if (frame_type != KEY_FRAME && accurate_estimate) {
+ assert(cpi->rec_sse != UINT64_MAX);
+ const int mbs = cm->mi_params.MBs;
+ const int res = (min_dim < 480) ? 0 : ((min_dim < 720) ? 1 : 2);
+ const double sse_over_q2 = (double)(cpi->rec_sse << BPER_MB_NORMBITS) /
+ ((double)q * q) / (double)mbs;
+ const double coef[3][2] = {
+ { 0.535, 3000.0 }, // < 480
+ { 0.590, 3000.0 }, // < 720
+ { 0.485, 1000.0 } // 720
+ };
+ int bits = (int)(coef[res][0] * sse_over_q2 + coef[res][1]);
+ return (int)(bits * correction_factor);
+ }
+
+ const int enumerator =
+ av1_get_bpmb_enumerator(frame_type, is_screen_content_type);
assert(correction_factor <= MAX_BPB_FACTOR &&
correction_factor >= MIN_BPB_FACTOR);
@@ -175,11 +207,14 @@ int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
return (int)(enumerator * correction_factor / q);
}
-int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
- double correction_factor, aom_bit_depth_t bit_depth,
- const int is_screen_content_type) {
- const int bpm = (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor,
- bit_depth, is_screen_content_type));
+int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ const int mbs = cm->mi_params.MBs;
+ const int bpm =
+ (int)(av1_rc_bits_per_mb(cpi, frame_type, q, correction_factor,
+ cpi->sf.hl_sf.accurate_bit_estimate));
return AOMMAX(FRAME_OVERHEAD_BITS,
(int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
}
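Illustrative note, not part of the patch: with accurate_estimate enabled for inter frames, av1_rc_bits_per_mb() now models bits per macroblock as a linear function of the reconstruction SSE normalized by q^2, roughly bits = coef[res][0] * sse_over_q2 + coef[res][1], where res selects the coefficient pair by the frame's smaller dimension (< 480, < 720, otherwise the 720-and-above pair {0.485, 1000.0}); the result is still scaled by the running correction_factor before being returned.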
@@ -372,6 +407,7 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) {
const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
rc->frames_till_gf_update_due = 0;
rc->ni_av_qi = rc_cfg->worst_allowed_q;
@@ -442,13 +478,15 @@ int av1_rc_drop_frame(AV1_COMP *cpi) {
}
}
-static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
+static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
+ int width, int height) {
const RATE_CONTROL *const rc = &cpi->rc;
const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
const AV1_COMMON *const cm = &cpi->common;
const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
- const int max_delta_down =
- (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 8 : 16;
+ const int max_delta_down = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+ ? AOMMIN(8, AOMMAX(1, rc->q_1_frame / 16))
+ : AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
const int max_delta_up = 20;
const int change_avg_frame_bandwidth =
abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
@@ -457,8 +495,8 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
// then set this flag to indicate change in target bits per macroblock.
const int change_target_bits_mb =
cm->prev_frame &&
- (cm->width != cm->prev_frame->width ||
- cm->height != cm->prev_frame->height || change_avg_frame_bandwidth);
+ (width != cm->prev_frame->width || height != cm->prev_frame->height ||
+ change_avg_frame_bandwidth);
// Apply some control/clamp to QP under certain conditions.
if (cm->current_frame.frame_type != KEY_FRAME && !cpi->ppi->use_svc &&
rc->frames_since_key > 1 && !change_target_bits_mb &&
@@ -511,8 +549,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
// For single spatial layer: if resolution has increased push q closer
// to the active_worst to avoid excess overshoot.
if (cpi->svc.number_spatial_layers <= 1 && cm->prev_frame &&
- (cm->width * cm->height >
- 1.5 * cm->prev_frame->width * cm->prev_frame->height))
+ (width * height > 1.5 * cm->prev_frame->width * cm->prev_frame->height))
q = (q + active_worst_quality) >> 1;
return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
}
@@ -606,7 +643,7 @@ static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
* \param[in] width Frame width
* \param[in] height Frame height
*
- * \return None but updates the rate correction factor for the
+ * \remark Updates the rate correction factor for the
* current frame type in cpi->rc.
*/
static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage,
@@ -656,25 +693,41 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage,
double rate_correction_factor =
get_rate_correction_factor(cpi, width, height);
double adjustment_limit;
- const int MBs = av1_get_MBs(width, height);
int projected_size_based_on_q = 0;
+ int cyclic_refresh_active =
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled;
// Do not update the rate factors for arf overlay frames.
if (cpi->rc.is_src_frame_alt_ref) return;
+ // Don't update the rate correction factors here on scene changes, as
+ // they are already reset in av1_encodedframe_overshoot_cbr(); instead,
+ // only reset the variables related to the previous frame's q and size.
+ // Note that the counter of frames since the last scene change is only
+ // valid when cyclic refresh mode is enabled, and that this early return
+ // only applies to scene changes that are not recorded as INTRA-only key
+ // frames.
+ if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) &&
+ (cpi->cyclic_refresh->counter_encode_maxq_scene_change == 0) &&
+ (cm->current_frame.frame_type != KEY_FRAME) && (!cpi->ppi->use_svc)) {
+ cpi->rc.q_2_frame = cm->quant_params.base_qindex;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ return;
+ }
+
// Clear down mmx registers to allow floating point in what follows
// Work out how big we would have expected the frame to be at this Q given
// the current correction factor.
// Stay in double to avoid int overflow when values are large
- if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ if (cyclic_refresh_active) {
projected_size_based_on_q =
av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
} else {
projected_size_based_on_q = av1_estimate_bits_at_q(
- cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs,
- rate_correction_factor, cm->seq_params->bit_depth,
- cpi->is_screen_content_type);
+ cpi, cm->quant_params.base_qindex, rate_correction_factor);
}
// Work out a size correction factor.
if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
@@ -682,7 +735,17 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage,
(double)projected_size_based_on_q;
// Clamp correction factor to prevent anything too extreme
- correction_factor = AOMMIN(AOMMAX(correction_factor, 0.25), 4.0);
+ correction_factor = AOMMAX(correction_factor, 0.25);
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 1.1)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 0.9)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
// Decide how heavily to dampen the adjustment
if (correction_factor > 0.0) {
@@ -697,15 +760,23 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage,
adjustment_limit = 0.75;
}
- cpi->rc.q_2_frame = cpi->rc.q_1_frame;
- cpi->rc.q_1_frame = cm->quant_params.base_qindex;
- cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
- if (correction_factor > 1.1)
- cpi->rc.rc_1_frame = -1;
- else if (correction_factor < 0.9)
- cpi->rc.rc_1_frame = 1;
- else
- cpi->rc.rc_1_frame = 0;
+ // Adjust the delta Q and the number of blocks updated in cyclic refresh
+ // based on over- or undershoot of the target in the current frame.
+ if (cyclic_refresh_active && (cpi->rc.this_frame_target > 0) &&
+ !cpi->ppi->use_svc) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ if (correction_factor > 1.25) {
+ cr->percent_refresh_adjustment =
+ AOMMAX(cr->percent_refresh_adjustment - 1, -5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMAX(cr->rate_ratio_qdelta_adjustment - 0.05, -0.0);
+ } else if (correction_factor < 0.5) {
+ cr->percent_refresh_adjustment =
+ AOMMIN(cr->percent_refresh_adjustment + 1, 5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMIN(cr->rate_ratio_qdelta_adjustment + 0.05, 0.25);
+ }
+ }
if (correction_factor > 1.01) {
// We are not already at the worst allowable quality
@@ -737,9 +808,9 @@ static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
const AV1_COMMON *const cm = &cpi->common;
return use_cyclic_refresh
? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
- : av1_rc_bits_per_mb(cm->current_frame.frame_type, q,
- correction_factor, cm->seq_params->bit_depth,
- cpi->is_screen_content_type);
+ : av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, q,
+ correction_factor,
+ cpi->sf.hl_sf.accurate_bit_estimate);
}
/*!\brief Searches for a Q index value predicted to give an average macro
@@ -822,7 +893,7 @@ int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
active_best_quality, active_worst_quality);
if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi))
- return adjust_q_cbr(cpi, q, active_worst_quality);
+ return adjust_q_cbr(cpi, q, active_worst_quality, width, height);
return q;
}
@@ -1092,7 +1163,6 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width,
const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
const CurrentFrame *const current_frame = &cm->current_frame;
int q;
- const int bit_depth = cm->seq_params->bit_depth;
int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
int active_best_quality = calc_active_best_quality_no_stats_cbr(
cpi, active_worst_quality, width, height);
@@ -1112,33 +1182,28 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width,
if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced &&
current_frame->frame_number != 0) {
int qdelta = 0;
- qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
- active_worst_quality, 2.0,
- cpi->is_screen_content_type, bit_depth);
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 2.0);
*top_index = active_worst_quality + qdelta;
*top_index = AOMMAX(*top_index, *bottom_index);
}
- // Special case code to try and match quality with forced key frames
- if (current_frame->frame_type == KEY_FRAME && p_rc->this_key_frame_forced) {
- q = p_rc->last_boosted_qindex;
- } else {
- q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
- active_worst_quality, width, height);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
#if RT_PASSIVE_STRATEGY
- if (current_frame->frame_type != KEY_FRAME &&
- cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
- q = get_q_passive_strategy(cpi, q, 50);
- }
+ if (current_frame->frame_type != KEY_FRAME &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ q = get_q_passive_strategy(cpi, q, 50);
+ }
#endif // RT_PASSIVE_STRATEGY
- if (q > *top_index) {
- // Special case when we are targeting the max allowed rate
- if (rc->this_frame_target >= rc->max_frame_bandwidth)
- *top_index = q;
- else
- q = *top_index;
- }
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
}
+
// Special case: we force the first few frames to use low q such that
// these frames are encoded at a high quality, which provides good
// references for following frames.
@@ -1342,14 +1407,12 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
int qdelta = 0;
if (current_frame->frame_type == KEY_FRAME &&
!p_rc->this_key_frame_forced && current_frame->frame_number != 0) {
- qdelta = av1_compute_qdelta_by_rate(
- &cpi->rc, current_frame->frame_type, active_worst_quality, 2.0,
- cpi->is_screen_content_type, bit_depth);
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 2.0);
} else if (!rc->is_src_frame_alt_ref &&
(refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
- qdelta = av1_compute_qdelta_by_rate(
- &cpi->rc, current_frame->frame_type, active_worst_quality, 1.75,
- cpi->is_screen_content_type, bit_depth);
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 1.75);
}
*top_index = active_worst_quality + qdelta;
*top_index = AOMMAX(*top_index, *bottom_index);
@@ -1400,9 +1463,7 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
const double rate_factor =
(rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer];
- return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor,
- cpi->is_screen_content_type,
- cpi->common.seq_params->bit_depth);
+ return av1_compute_qdelta_by_rate(cpi, frame_type, q, rate_factor);
}
// This unrestricted Q selection on CQ mode is useful when testing new features,
@@ -1487,7 +1548,7 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
double q_adj_factor = 1.0;
double q_val;
- // Baseline value derived from cpi->active_worst_quality and kf boost.
+ // Baseline value derived from active_worst_quality and kf boost.
active_best_quality =
get_kf_active_quality(p_rc, active_worst_quality, bit_depth);
if (cpi->is_screen_content_type) {
@@ -1540,7 +1601,6 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
const RATE_CONTROL *const rc = &cpi->rc;
const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
- const int bit_depth = cpi->common.seq_params->bit_depth;
int active_best_quality = *active_best;
int active_worst_quality = *active_worst;
#if CONFIG_FPMT_TEST
@@ -1595,9 +1655,8 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
// Modify active_best_quality for downscaled normal frames.
if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
- int qdelta = av1_compute_qdelta_by_rate(
- rc, cm->current_frame.frame_type, active_best_quality, 2.0,
- cpi->is_screen_content_type, bit_depth);
+ int qdelta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+ active_best_quality, 2.0);
active_best_quality =
AOMMAX(active_best_quality + qdelta, rc->best_quality);
}
@@ -1935,8 +1994,65 @@ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
return q;
}
-int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
- int gf_index, int *bottom_index, int *top_index) {
+static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ YV12_BUFFER_CONFIG const *const unscaled_src = cpi->unscaled_source;
+ if (unscaled_src == NULL) return;
+
+ const uint8_t *src_y = unscaled_src->y_buffer;
+ const int src_ystride = unscaled_src->y_stride;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const uint8_t *pre_y = yv12->buffers[0];
+ const int pre_ystride = yv12->strides[0];
+
+ // TODO(yunqing): support scaled reference frames.
+ if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return;
+
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ int num_samples = 0;
+ // SSE is computed on 64x64 blocks
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+
+ uint64_t fsse = 0;
+ cpi->rec_sse = 0;
+
+ for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ unsigned int sse;
+ uint8_t src[64 * 64] = { 0 };
+ // Apply 4x4 block averaging/denoising on source frame.
+ for (int i = 0; i < 64; i += 4) {
+ for (int j = 0; j < 64; j += 4) {
+ const unsigned int avg =
+ aom_avg_4x4(src_y + i * src_ystride + j, src_ystride);
+
+ for (int m = 0; m < 4; ++m) {
+ for (int n = 0; n < 4; ++n) src[i * 64 + j + m * 64 + n] = avg;
+ }
+ }
+ }
+
+ cpi->ppi->fn_ptr[bsize].vf(src, 64, pre_y, pre_ystride, &sse);
+ fsse += sse;
+ num_samples++;
+ src_y += 64;
+ pre_y += 64;
+ }
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ pre_y += (pre_ystride << 6) - (sb_cols << 6);
+ }
+ assert(num_samples > 0);
+ if (num_samples > 0) cpi->rec_sse = fsse;
+}
+
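The loop above averages each 4x4 sub-block of the source before taking a 64x64 SSE against the last reference, stepping both planes one 64x64 block at a time. A minimal standalone sketch of just the block-walk pointer arithmetic, with a made-up plane size and none of the encoder's data structures:

#include <stdint.h>
#include <stdio.h>

// Visit a width x height plane in 64x64 blocks; both dimensions are assumed
// to be multiples of 64, and 'stride' is the distance between rows in bytes.
static void walk_64x64_blocks(const uint8_t *buf, int width, int height,
                              int stride) {
  const int blk_cols = width / 64;
  const int blk_rows = height / 64;
  const uint8_t *row_ptr = buf;
  for (int r = 0; r < blk_rows; ++r) {
    const uint8_t *blk_ptr = row_ptr;
    for (int c = 0; c < blk_cols; ++c) {
      // blk_ptr is the top-left pixel of block (r, c).
      printf("block (%d,%d) at offset %ld\n", r, c, (long)(blk_ptr - buf));
      blk_ptr += 64;  // next block in the same row of blocks
    }
    // Equivalent to advancing by (stride << 6) - (blk_cols << 6) from the end
    // of the row walk: drop down 64 pixel rows from the row's start.
    row_ptr += 64 * stride;
  }
}

int main(void) {
  static uint8_t plane[128 * 128];
  walk_64x64_blocks(plane, 128, 128, 128);
  return 0;
}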
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int gf_index,
+ int *bottom_index, int *top_index) {
PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
int q;
// TODO(sarahparker) merge no-stats vbr and altref q computation
@@ -1946,8 +2062,17 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
gf_group->update_type[gf_index] == ARF_UPDATE) &&
has_no_stats_stage(cpi)) {
if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ // TODO(yunqing): the results could be used for encoder optimization.
+ cpi->rec_sse = UINT64_MAX;
+ if (cpi->sf.hl_sf.accurate_bit_estimate &&
+ cpi->common.current_frame.frame_type != KEY_FRAME)
+ rc_compute_variance_onepass_rt(cpi);
+
q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
top_index);
+ // preserve copy of active worst quality selected.
+ cpi->rc.active_worst_quality = *top_index;
+
#if USE_UNRESTRICTED_Q_IN_CQ_MODE
} else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
@@ -2050,6 +2175,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
} else {
if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
+ cpi->rc.rtc_external_ratectrl ||
(!rc->is_src_frame_alt_ref &&
!(refresh_frame->golden_frame || is_intrnl_arf ||
refresh_frame->alt_ref_frame))) {
@@ -2136,6 +2262,10 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
}
#endif
if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+ if (cpi->refresh_frame.golden_frame)
+ rc->frame_num_last_gf_refresh = current_frame->frame_number;
+ rc->prev_coded_width = cm->width;
+ rc->prev_coded_height = cm->height;
// if (current_frame->frame_number == 1 && cm->show_frame)
/*
rc->this_frame_target =
@@ -2147,11 +2277,15 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
// Update buffer level with zero size, update frame counters, and return.
update_buffer_level(cpi, 0);
- cpi->rc.frames_since_key++;
- cpi->rc.frames_to_key--;
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
cpi->rc.rc_2_frame = 0;
cpi->rc.rc_1_frame = 0;
cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+ cpi->rc.prev_coded_width = cpi->common.width;
+ cpi->rc.prev_coded_height = cpi->common.height;
}
int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
@@ -2188,17 +2322,16 @@ int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
// To be precise, 'q_index' is the smallest integer, for which the corresponding
// bits per mb <= desired_bits_per_mb.
// If no such q index is found, returns 'worst_qindex'.
-static int find_qindex_by_rate(int desired_bits_per_mb,
- aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
- const int is_screen_content_type,
+static int find_qindex_by_rate(const AV1_COMP *const cpi,
+ int desired_bits_per_mb, FRAME_TYPE frame_type,
int best_qindex, int worst_qindex) {
assert(best_qindex <= worst_qindex);
int low = best_qindex;
int high = worst_qindex;
while (low < high) {
const int mid = (low + high) >> 1;
- const int mid_bits_per_mb = av1_rc_bits_per_mb(
- frame_type, mid, 1.0, bit_depth, is_screen_content_type);
+ const int mid_bits_per_mb =
+ av1_rc_bits_per_mb(cpi, frame_type, mid, 1.0, 0);
if (mid_bits_per_mb > desired_bits_per_mb) {
low = mid + 1;
} else {
@@ -2206,26 +2339,25 @@ static int find_qindex_by_rate(int desired_bits_per_mb,
}
}
assert(low == high);
- assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth,
- is_screen_content_type) <= desired_bits_per_mb ||
+ assert(av1_rc_bits_per_mb(cpi, frame_type, low, 1.0, 0) <=
+ desired_bits_per_mb ||
low == worst_qindex);
return low;
}
-int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
- int qindex, double rate_target_ratio,
- const int is_screen_content_type,
- aom_bit_depth_t bit_depth) {
+int av1_compute_qdelta_by_rate(const AV1_COMP *cpi, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio) {
+ const RATE_CONTROL *rc = &cpi->rc;
+
// Look up the current projected bits per block for the base index
- const int base_bits_per_mb = av1_rc_bits_per_mb(
- frame_type, qindex, 1.0, bit_depth, is_screen_content_type);
+ const int base_bits_per_mb =
+ av1_rc_bits_per_mb(cpi, frame_type, qindex, 1.0, 0);
// Find the target bits per mb based on the base value and given ratio.
const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
const int target_index = find_qindex_by_rate(
- target_bits_per_mb, bit_depth, frame_type, is_screen_content_type,
- rc->best_quality, rc->worst_quality);
+ cpi, target_bits_per_mb, frame_type, rc->best_quality, rc->worst_quality);
return target_index - qindex;
}
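find_qindex_by_rate() relies on the bits-per-mb estimate being non-increasing in the q index, so a binary search can return the smallest index whose estimated rate fits under the target. A self-contained sketch of the same search over a toy monotonic table (the values are illustrative, not the codec's rate model):

#include <assert.h>
#include <stdio.h>

// Returns the smallest index in [best_idx, worst_idx] whose rate estimate is
// <= desired, or worst_idx if none qualifies. rate[] must be non-increasing.
static int find_index_by_rate(const int *rate, int desired, int best_idx,
                              int worst_idx) {
  assert(best_idx <= worst_idx);
  int low = best_idx, high = worst_idx;
  while (low < high) {
    const int mid = (low + high) >> 1;
    if (rate[mid] > desired) {
      low = mid + 1;  // mid is still too expensive; move toward higher q
    } else {
      high = mid;     // mid fits; keep looking for a smaller index
    }
  }
  return low;
}

int main(void) {
  // Toy non-increasing bits-per-mb table indexed by "q index".
  const int rate[8] = { 900, 700, 560, 430, 320, 240, 170, 120 };
  printf("%d\n", find_index_by_rate(rate, 400, 0, 7));  // prints 4
  return 0;
}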
@@ -2478,7 +2610,7 @@ int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
}
} else {
int kf_boost = 32;
- double framerate = cpi->framerate;
+ int framerate = (int)round(cpi->framerate);
kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
if (rc->frames_since_key < framerate / 2) {
@@ -2489,14 +2621,34 @@ int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
return av1_rc_clamp_iframe_target_size(cpi, target);
}
-static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+static void set_golden_update(AV1_COMP *const cpi) {
RATE_CONTROL *const rc = &cpi->rc;
PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
- GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ int divisor = 10;
if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
- av1_cyclic_refresh_set_golden_update(cpi);
+ divisor = cpi->cyclic_refresh->percent_refresh;
+
+ // Set minimum gf_interval for GF update to a multiple of the refresh period,
+ // with some max limit. Depending on past encoding stats, GF flag may be
+ // reset and update may not occur until next baseline_gf_interval.
+ const int gf_length_mult[2] = { 8, 4 };
+ if (divisor > 0)
+ p_rc->baseline_gf_interval =
+ AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / divisor),
+ MAX_GF_INTERVAL_RT);
else
- p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+ if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40)
+ p_rc->baseline_gf_interval = 16;
+}
+
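The rule above ties the GF update period to the cyclic-refresh rate: 100 / percent_refresh frames cover the whole frame once, scaled by 8 or 4 depending on the gf_length_lvl speed feature and then capped. A small arithmetic sketch; the cap value stands in for MAX_GF_INTERVAL_RT and the divisor <= 0 fallback is simplified:

#include <stdio.h>

#define AOMMIN(a, b) ((a) < (b) ? (a) : (b))

// refresh_percent: cyclic-refresh percentage per frame; lvl picks the
// multiplier; cap stands in for the MAX_GF_INTERVAL_RT-style limit.
static int gf_interval(int refresh_percent, int lvl, int cap) {
  static const int mult[2] = { 8, 4 };
  if (refresh_percent <= 0) return cap;  // simplified fixed-interval fallback
  return AOMMIN(mult[lvl] * (100 / refresh_percent), cap);
}

int main(void) {
  // 10% refreshed per frame, lvl 0: 8 * (100 / 10) = 80 (below the cap).
  printf("%d\n", gf_interval(10, 0, 160));  // prints 80
  // 25% refreshed per frame, lvl 1: 4 * (100 / 25) = 16.
  printf("%d\n", gf_interval(25, 1, 160));  // prints 16
  return 0;
}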
+static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ set_golden_update(cpi);
+
if (p_rc->baseline_gf_interval > rc->frames_to_key &&
cpi->oxcf.kf_cfg.auto_key)
p_rc->baseline_gf_interval = rc->frames_to_key;
@@ -2536,7 +2688,7 @@ static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
- SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
const int resize_pending = is_frame_resize_pending(cpi);
if (!resize_pending && !rc->high_source_sad) {
// Check if we should disable GF refresh (if period is up),
@@ -2548,10 +2700,12 @@ void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10);
int gf_update_changed = 0;
int thresh = 87;
- if (rc->frames_till_gf_update_due == 1 &&
+ if ((cm->current_frame.frame_number - cpi->rc.frame_num_last_gf_refresh) <
+ FIXED_GF_INTERVAL_RT &&
+ rc->frames_till_gf_update_due == 1 &&
cm->quant_params.base_qindex > avg_qp) {
- // Disable GF refresh since QP is above the runninhg average QP.
- svc->refresh[svc->gld_idx_1layer] = 0;
+ // Disable GF refresh since QP is above the running average QP.
+ rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 0;
gf_update_changed = 1;
cpi->refresh_frame.golden_frame = 0;
} else if (allow_gf_update &&
@@ -2559,7 +2713,7 @@ void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
(rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) {
// Force refresh since QP is well below average QP or this is a high
// motion frame.
- svc->refresh[svc->gld_idx_1layer] = 1;
+ rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 1;
gf_update_changed = 1;
cpi->refresh_frame.golden_frame = 1;
}
@@ -2567,8 +2721,9 @@ void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
set_baseline_gf_interval(cpi, INTER_FRAME);
int refresh_mask = 0;
for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
- int ref_frame_map_idx = svc->ref_idx[i];
- refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx;
+ int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ refresh_mask |= rtc_ref->refresh[ref_frame_map_idx]
+ << ref_frame_map_idx;
}
cm->current_frame.refresh_frame_flags = refresh_mask;
}
@@ -2587,19 +2742,18 @@ void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
* \param[in] cpi Top level encoder structure
* \param[in] gf_update Flag to indicate if GF is updated
*
- * \return Nothing is returned. Instead the settings for the prediction
+ * \remark Nothing is returned. Instead the settings for the prediction
 * structure are set in \c cpi->ext_flags, and the buffer slot index
* (for each of 7 references) and refresh flags (for each of the 8 slots)
* are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[].
*/
-void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
+void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) {
AV1_COMMON *const cm = &cpi->common;
ExternalFlags *const ext_flags = &cpi->ext_flags;
RATE_CONTROL *const rc = &cpi->rc;
ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
&ext_flags->refresh_frame;
- SVC *const svc = &cpi->svc;
- const int gld_fixed_slot = 1;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
unsigned int lag_alt = 4;
int last_idx = 0;
int last_idx_refresh = 0;
@@ -2607,7 +2761,6 @@ void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
int alt_ref_idx = 0;
int last2_idx = 0;
ext_refresh_frame_flags->update_pending = 1;
- svc->set_ref_frame_config = 1;
ext_flags->ref_frame_flags = 0;
ext_refresh_frame_flags->last_frame = 1;
ext_refresh_frame_flags->golden_frame = 0;
@@ -2630,33 +2783,28 @@ void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
else if (rc->avg_source_sad > th_frame_sad[th_idx][2])
lag_alt = 5;
}
- for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) svc->ref_idx[i] = 7;
- for (int i = 0; i < REF_FRAMES; ++i) svc->refresh[i] = 0;
+ // This defines the reference structure for 1 layer (non-svc) RTC encoding.
+ // To avoid the internal/default reference structure for non-realtime
+ // overwriting this behavior, we use the "svc" ref parameters from the
+ // external control SET_SVC_REF_FRAME_CONFIG.
+ // TODO(marpan): rename that control and the related internal parameters
+ // to rtc_ref.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) rtc_ref->ref_idx[i] = 7;
+ for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0;
// Set the reference frame flags.
ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
- const int sh = 7 - gld_fixed_slot;
+ const int sh = 6;
// Moving index slot for last: 0 - (sh - 1).
if (cm->current_frame.frame_number > 1)
last_idx = ((cm->current_frame.frame_number - 1) % sh);
// Moving index for refresh of last: one ahead for next frame.
last_idx_refresh = (cm->current_frame.frame_number % sh);
gld_idx = 6;
- if (!gld_fixed_slot) {
- gld_idx = 7;
- const unsigned int lag_gld = 7; // Must be <= 7.
- // Moving index for gld_ref, lag behind current by gld_interval frames.
- if (cm->current_frame.frame_number > lag_gld)
- gld_idx = ((cm->current_frame.frame_number - lag_gld) % sh);
- // When golden is not long-term reference with fixed slot update but
- // a reference with a moving slot with fixed lag behind last
- // (i.e., gld_fixed_slot = 0), we should disable the
- // gf_refresh_based_on_qp feature.
- cpi->sf.rt_sf.gf_refresh_based_on_qp = 0;
- }
+
// Moving index for alt_ref, lag behind LAST by lag_alt frames.
if (cm->current_frame.frame_number > lag_alt)
alt_ref_idx = ((cm->current_frame.frame_number - lag_alt) % sh);
@@ -2665,32 +2813,31 @@ void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
if (cm->current_frame.frame_number > 2)
last2_idx = ((cm->current_frame.frame_number - 2) % sh);
}
- svc->ref_idx[0] = last_idx; // LAST
- svc->ref_idx[1] = last_idx_refresh; // LAST2 (for refresh of last).
+ rtc_ref->ref_idx[0] = last_idx; // LAST
+ rtc_ref->ref_idx[1] = last_idx_refresh; // LAST2 (for refresh of last).
if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
- svc->ref_idx[1] = last2_idx; // LAST2
- svc->ref_idx[2] = last_idx_refresh; // LAST3 (for refresh of last).
+ rtc_ref->ref_idx[1] = last2_idx; // LAST2
+ rtc_ref->ref_idx[2] = last_idx_refresh; // LAST3 (for refresh of last).
}
- svc->ref_idx[3] = gld_idx; // GOLDEN
- svc->ref_idx[6] = alt_ref_idx; // ALT_REF
+ rtc_ref->ref_idx[3] = gld_idx; // GOLDEN
+ rtc_ref->ref_idx[6] = alt_ref_idx; // ALT_REF
// Refresh this slot, which will become LAST on next frame.
- svc->refresh[last_idx_refresh] = 1;
+ rtc_ref->refresh[last_idx_refresh] = 1;
// Update GOLDEN on period for fixed slot case.
- if (gld_fixed_slot && gf_update &&
- cm->current_frame.frame_type != KEY_FRAME) {
+ if (gf_update && cm->current_frame.frame_type != KEY_FRAME) {
ext_refresh_frame_flags->golden_frame = 1;
- svc->refresh[gld_idx] = 1;
+ rtc_ref->refresh[gld_idx] = 1;
}
- svc->gld_idx_1layer = gld_idx;
+ rtc_ref->gld_idx_1layer = gld_idx;
// Set the flag to reduce the number of reference frame buffers used.
// This assumes that slot 7 is never used.
cpi->rt_reduce_num_ref_buffers = 1;
- cpi->rt_reduce_num_ref_buffers &= (svc->ref_idx[0] < 7);
- cpi->rt_reduce_num_ref_buffers &= (svc->ref_idx[1] < 7);
- cpi->rt_reduce_num_ref_buffers &= (svc->ref_idx[3] < 7);
- cpi->rt_reduce_num_ref_buffers &= (svc->ref_idx[6] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[0] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[1] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[3] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[6] < 7);
if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
- cpi->rt_reduce_num_ref_buffers &= (svc->ref_idx[2] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7);
}
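With GOLDEN pinned to slot 6, LAST and ALTREF rotate through slots 0..5 (sh = 6): LAST lags the current frame by one, its refresh target is the slot that becomes LAST on the next frame, and ALTREF lags by lag_alt frames. A standalone sketch of that slot arithmetic, using the default lag of 4 from above:

#include <stdio.h>

int main(void) {
  const int sh = 6;                // moving slots 0..5; slot 6 stays GOLDEN
  const unsigned int lag_alt = 4;  // ALTREF lag behind LAST (default above)
  for (unsigned int frame = 1; frame <= 8; ++frame) {
    const int last_idx = frame > 1 ? (int)((frame - 1) % sh) : 0;
    const int last_idx_refresh = (int)(frame % sh);  // LAST on the next frame
    const int alt_ref_idx =
        frame > lag_alt ? (int)((frame - lag_alt) % sh) : 0;
    printf("frame %u: LAST=%d refresh=%d ALT=%d GOLDEN=6\n", frame, last_idx,
           last_idx_refresh, alt_ref_idx);
  }
  return 0;
}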
/*!\brief Check for scene detection, for 1 pass real-time mode.
@@ -2703,7 +2850,7 @@ void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
* \param[in] cpi Top level encoder structure
* \param[in] frame_input Current and last input source frames
*
- * \return Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
+ * \remark Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
* is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated.
*/
static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
@@ -2720,9 +2867,14 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
int last_src_ystride;
int last_src_width;
int last_src_height;
- if (cm->spatial_layer_id != 0 || cm->width != cm->render_width ||
- cm->height != cm->render_height || unscaled_src == NULL ||
- unscaled_last_src == NULL) {
+ int width = cm->width;
+ int height = cm->height;
+ if (cpi->svc.number_spatial_layers > 1) {
+ width = cpi->oxcf.frm_dim_cfg.width;
+ height = cpi->oxcf.frm_dim_cfg.height;
+ }
+ if (width != cm->render_width || height != cm->render_height ||
+ unscaled_src == NULL || unscaled_last_src == NULL) {
if (cpi->src_sad_blk_64x64) {
aom_free(cpi->src_sad_blk_64x64);
cpi->src_sad_blk_64x64 = NULL;
@@ -2745,98 +2897,108 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
return;
}
rc->high_source_sad = 0;
- rc->high_num_blocks_with_motion = 0;
+ rc->percent_blocks_with_motion = 0;
+ rc->max_block_source_sad = 0;
rc->prev_avg_source_sad = rc->avg_source_sad;
- if (src_width == last_src_width && src_height == last_src_height) {
- const int num_mi_cols = cm->mi_params.mi_cols;
- const int num_mi_rows = cm->mi_params.mi_rows;
- int num_zero_temp_sad = 0;
- uint32_t min_thresh = 10000;
- if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) min_thresh = 100000;
- const BLOCK_SIZE bsize = BLOCK_64X64;
- int full_sampling = (cm->width * cm->height < 640 * 360) ? 1 : 0;
- // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
- uint64_t avg_sad = 0;
- uint64_t tmp_sad = 0;
- int num_samples = 0;
- const int thresh = 6;
- // SAD is computed on 64x64 blocks
- const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
- ? (cm->seq_params->mib_size >> 1)
- : cm->seq_params->mib_size;
- const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
- const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
- uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5
- int num_low_var_high_sumdiff = 0;
- int light_change = 0;
- // Flag to check light change or not.
- const int check_light_change = 0;
- // Store blkwise SAD for later use
- if (cpi->sf.rt_sf.sad_based_comp_prune && (cm->spatial_layer_id == 0) &&
- (cm->width == cm->render_width) && (cm->height == cm->render_height)) {
- full_sampling = 1;
- if (cpi->src_sad_blk_64x64 == NULL) {
- CHECK_MEM_ERROR(
- cm, cpi->src_sad_blk_64x64,
- (uint64_t *)aom_calloc(sb_cols * sb_rows,
- sizeof(*cpi->src_sad_blk_64x64)));
- }
+ int num_mi_cols = cm->mi_params.mi_cols;
+ int num_mi_rows = cm->mi_params.mi_rows;
+ if (cpi->svc.number_spatial_layers > 1) {
+ num_mi_cols = cpi->svc.mi_cols_full_resoln;
+ num_mi_rows = cpi->svc.mi_rows_full_resoln;
+ }
+ int num_zero_temp_sad = 0;
+ uint32_t min_thresh = 10000;
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) min_thresh = 100000;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
+ uint64_t avg_sad = 0;
+ uint64_t tmp_sad = 0;
+ int num_samples = 0;
+ const int thresh = 6;
+ // SAD is computed on 64x64 blocks
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+ uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / (64*64)) ~1.5
+ int num_low_var_high_sumdiff = 0;
+ int light_change = 0;
+ // Flag to check light change or not.
+ const int check_light_change = 0;
+ // Store blkwise SAD for later use
+ if (width == cm->render_width && height == cm->render_height) {
+ if (cpi->src_sad_blk_64x64 == NULL) {
+ CHECK_MEM_ERROR(cm, cpi->src_sad_blk_64x64,
+ (uint64_t *)aom_calloc(sb_cols * sb_rows,
+ sizeof(*cpi->src_sad_blk_64x64)));
}
- for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
- for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
- // Checker-board pattern, ignore boundary.
- if (full_sampling ||
- ((sbi_row > 0 && sbi_col > 0) &&
- (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
- ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
- (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
- tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
- last_src_ystride);
- if (cpi->src_sad_blk_64x64 != NULL)
- cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
- if (check_light_change) {
- unsigned int sse, variance;
- variance = cpi->ppi->fn_ptr[bsize].vf(
- src_y, src_ystride, last_src_y, last_src_ystride, &sse);
- // Note: sse - variance = ((sum * sum) >> 12)
- // Detect large lighting change.
- if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
- num_low_var_high_sumdiff++;
- }
- }
- avg_sad += tmp_sad;
- num_samples++;
- if (tmp_sad == 0) num_zero_temp_sad++;
+ }
+ for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ if (cpi->src_sad_blk_64x64 != NULL)
+ cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
+ if (check_light_change) {
+ unsigned int sse, variance;
+ variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+ last_src_ystride, &sse);
+ // Note: sse - variance = ((sum * sum) >> 12)
+ // Detect large lighting change.
+ if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
+ num_low_var_high_sumdiff++;
}
- src_y += 64;
- last_src_y += 64;
}
- src_y += (src_ystride << 6) - (sb_cols << 6);
- last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+ avg_sad += tmp_sad;
+ num_samples++;
+ if (tmp_sad == 0) num_zero_temp_sad++;
+ if (tmp_sad > rc->max_block_source_sad)
+ rc->max_block_source_sad = tmp_sad;
+
+ src_y += 64;
+ last_src_y += 64;
+ }
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+ }
+ if (check_light_change && num_samples > 0 &&
+ num_low_var_high_sumdiff > (num_samples >> 1))
+ light_change = 1;
+ if (num_samples > 0) avg_sad = avg_sad / num_samples;
+ // Set high_source_sad flag if we detect very high increase in avg_sad
+ // between current and previous frame value(s). Use minimum threshold
+ // for cases where there is small change from content that is completely
+ // static.
+ if (!light_change &&
+ avg_sad >
+ AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
+ rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+ num_zero_temp_sad < 3 * (num_samples >> 2))
+ rc->high_source_sad = 1;
+ else
+ rc->high_source_sad = 0;
+ rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+ rc->frame_source_sad = avg_sad;
+ if (num_samples > 0)
+ rc->percent_blocks_with_motion =
+ ((num_samples - num_zero_temp_sad) * 100) / num_samples;
+ // Scene detection is only done on the base spatial layer, using the
+ // full/original resolution.
+ // Pass the state to the upper spatial layers.
+ if (cpi->svc.number_spatial_layers > 1) {
+ SVC *svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ int tl = svc->temporal_layer_id;
+ const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->high_source_sad = rc->high_source_sad;
+ lrc->frame_source_sad = rc->frame_source_sad;
+ lrc->avg_source_sad = rc->avg_source_sad;
+ lrc->percent_blocks_with_motion = rc->percent_blocks_with_motion;
+ lrc->max_block_source_sad = rc->max_block_source_sad;
}
- if (check_light_change && num_samples > 0 &&
- num_low_var_high_sumdiff > (num_samples >> 1))
- light_change = 1;
- if (num_samples > 0) avg_sad = avg_sad / num_samples;
- // Set high_source_sad flag if we detect very high increase in avg_sad
- // between current and previous frame value(s). Use minimum threshold
- // for cases where there is small change from content that is completely
- // static.
- if (!light_change &&
- avg_sad >
- AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
- rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
- num_zero_temp_sad < 3 * (num_samples >> 2))
- rc->high_source_sad = 1;
- else
- rc->high_source_sad = 0;
- rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
- rc->frame_source_sad = avg_sad;
-
- if (num_zero_temp_sad < (3 * num_samples >> 2))
- rc->high_num_blocks_with_motion = 1;
}
- cpi->svc.high_source_sad_superframe = rc->high_source_sad;
}
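The scene-change decision reduces to: flag high_source_sad when the frame's average 64x64 SAD jumps well above the running average (and an absolute floor) while enough sampled blocks actually changed. A simplified sketch of just that test, using the multiplier and zero-SAD fraction from the code above and omitting the frames_since_key and lighting-change checks:

#include <stdint.h>
#include <stdio.h>

#define AOMMAX(a, b) ((a) > (b) ? (a) : (b))

// avg_sad: this frame's mean 64x64 SAD against the last source frame.
// avg_source_sad: running average from previous frames.
// zero_sad_blocks / num_blocks: sampled blocks with zero temporal SAD.
static int is_scene_change(uint64_t avg_sad, uint64_t avg_source_sad,
                           uint64_t min_thresh, int zero_sad_blocks,
                           int num_blocks) {
  const int thresh = 6;  // multiplier on the running average, as above
  return avg_sad > AOMMAX(min_thresh, avg_source_sad * thresh) &&
         zero_sad_blocks < 3 * (num_blocks >> 2);
}

int main(void) {
  // Mostly-static content whose average SAD jumps from 9000 to 70000.
  printf("%d\n", is_scene_change(70000, 9000, 10000, 10, 100));  // prints 1
  // Same jump, but 90 of 100 blocks are still identical: not flagged.
  printf("%d\n", is_scene_change(70000, 9000, 10000, 90, 100));  // prints 0
  return 0;
}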
/*!\brief Set the GF baseline interval for 1 pass real-time mode.
@@ -2870,12 +3032,11 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
RATE_CONTROL *const rc = &cpi->rc;
PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
SVC *const svc = &cpi->svc;
- double tot_scale_change = 1.0;
int target_bits_per_frame;
int active_worst_quality;
int qindex;
- tot_scale_change = (double)(resize_width * resize_height) /
- (double)(prev_width * prev_height);
+ double tot_scale_change = (double)(resize_width * resize_height) /
+ (double)(prev_width * prev_height);
// Reset buffer level to optimal, update target size.
p_rc->buffer_level = p_rc->optimal_buffer_level;
p_rc->bits_off_target = p_rc->optimal_buffer_level;
@@ -2893,20 +3054,8 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
// If resize is down, check if projected q index is close to worst_quality,
// and if so, reduce the rate correction factor (since likely can afford
// lower q for resized frame).
- if (tot_scale_change < 1.0 && qindex > 90 * cpi->rc.worst_quality / 100)
+ if (tot_scale_change < 1.0 && qindex > 90 * rc->worst_quality / 100)
p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
- // Apply the same rate control reset to all temporal layers.
- for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
- LAYER_CONTEXT *lc = NULL;
- lc = &svc->layer_context[svc->spatial_layer_id *
- svc->number_temporal_layers +
- tl];
- lc->rc.resize_state = rc->resize_state;
- lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
- lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
- lc->p_rc.rate_correction_factors[INTER_FRAME] =
- p_rc->rate_correction_factors[INTER_FRAME];
- }
// If resize is back up: check if projected q index is too much above the
// previous index, and if so, reduce the rate correction factor
  // (since we prefer to keep q for the resized frame at least close to the previous q).
@@ -2917,7 +3066,21 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
- p_rc->rate_correction_factors[INTER_NORMAL] *= 2.0;
+ p_rc->rate_correction_factors[INTER_NORMAL] *= 1.5;
+ }
+ // Apply the same rate control reset to all temporal layers.
+ for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+ LAYER_CONTEXT *lc = NULL;
+ lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ tl];
+ lc->rc.resize_state = rc->resize_state;
+ lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+ lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+ lc->p_rc.rate_correction_factors[INTER_NORMAL] =
+ p_rc->rate_correction_factors[INTER_NORMAL];
+ lc->p_rc.avg_frame_qindex[INTER_FRAME] =
+ p_rc->avg_frame_qindex[INTER_FRAME];
}
}
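resize_reset_rc() derives tot_scale_change from the pixel-count ratio of the new and old frame sizes and uses it to rescale the frame target before re-deriving a q index; the target computation itself is outside this hunk, so the numbers below are only a worked ratio example with assumed bitrate figures:

#include <stdio.h>

int main(void) {
  const int prev_w = 1280, prev_h = 720;  // size before the dynamic resize
  const int new_w = 640, new_h = 360;     // size after a 2x downscale
  const double tot_scale_change =
      (double)(new_w * new_h) / (double)(prev_w * prev_h);
  const int avg_frame_bandwidth = 60000;  // assumed bits per frame beforehand
  const int scaled_target = (int)(tot_scale_change * avg_frame_bandwidth);
  printf("scale %.2f -> target %d bits\n", tot_scale_change, scaled_target);
  // Prints: scale 0.25 -> target 15000 bits
  return 0;
}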
@@ -2930,7 +3093,7 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
* \ingroup rate_control
* \param[in] cpi Top level encoder structure
*
- * \return Return resized width/height in \c cpi->resize_pending_params,
+ * \remark Return resized width/height in \c cpi->resize_pending_params,
* and update some resize counters in \c rc.
*/
static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
@@ -3042,8 +3205,7 @@ static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
return 0;
}
-void av1_get_one_pass_rt_params(AV1_COMP *cpi,
- EncodeFrameParams *const frame_params,
+void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
const EncodeFrameInput *frame_input,
unsigned int frame_flags) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -3065,7 +3227,7 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi,
}
// Set frame type.
if (set_key_frame(cpi, frame_flags)) {
- frame_params->frame_type = KEY_FRAME;
+ *frame_type = KEY_FRAME;
p_rc->this_key_frame_forced =
cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max;
@@ -3079,7 +3241,7 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi,
svc->layer_context[layer].is_key_frame = 1;
}
} else {
- frame_params->frame_type = INTER_FRAME;
+ *frame_type = INTER_FRAME;
gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE;
gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME;
gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE;
@@ -3089,12 +3251,13 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi,
svc->spatial_layer_id == 0
? 0
: svc->layer_context[svc->temporal_layer_id].is_key_frame;
- // If the user is setting the SVC pattern with set_ref_frame_config and
- // did not set any references, set the frame type to Intra-only.
- if (svc->set_ref_frame_config) {
+ // If the user is setting the reference structure with
+ // set_ref_frame_config and did not set any references, set the
+ // frame type to Intra-only.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config) {
int no_references_set = 1;
for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
- if (svc->reference[i]) {
+ if (cpi->ppi->rtc_ref.reference[i]) {
no_references_set = 0;
break;
}
@@ -3103,13 +3266,20 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi,
// The stream can start decoding on INTRA_ONLY_FRAME so long as the
// layer with the intra_only_frame doesn't signal a reference to a slot
// that hasn't been set yet.
- if (no_references_set) frame_params->frame_type = INTRA_ONLY_FRAME;
+ if (no_references_set) *frame_type = INTRA_ONLY_FRAME;
}
}
}
// Check for scene change: for SVC check on base spatial layer only.
- if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0)
- rc_scene_detection_onepass_rt(cpi, frame_input);
+ if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) {
+ if (rc->prev_coded_width == cm->width &&
+ rc->prev_coded_height == cm->height) {
+ rc_scene_detection_onepass_rt(cpi, frame_input);
+ } else if (cpi->src_sad_blk_64x64) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ }
// Check for dynamic resize, for single spatial layer for now.
// For temporal layers only check on base temporal layer.
if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) {
@@ -3132,19 +3302,17 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi,
}
// Set the GF interval and update flag.
if (!rc->rtc_external_ratectrl)
- set_gf_interval_update_onepass_rt(cpi, frame_params->frame_type);
+ set_gf_interval_update_onepass_rt(cpi, *frame_type);
// Set target size.
if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
- if (frame_params->frame_type == KEY_FRAME ||
- frame_params->frame_type == INTRA_ONLY_FRAME) {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
} else {
target = av1_calc_pframe_target_size_one_pass_cbr(
cpi, gf_group->update_type[cpi->gf_frame_index]);
}
} else {
- if (frame_params->frame_type == KEY_FRAME ||
- frame_params->frame_type == INTRA_ONLY_FRAME) {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
} else {
target = av1_calc_pframe_target_size_one_pass_vbr(
@@ -3156,7 +3324,7 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi,
av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
rc->base_frame_target = target;
- cm->current_frame.frame_type = frame_params->frame_type;
+ cm->current_frame.frame_type = *frame_type;
// For fixed mode SVC: if KSVC is enabled remove inter layer
// prediction on spatial enhancement layer frames for frames
// whose base is not KEY frame.
@@ -3170,76 +3338,64 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi,
int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
AV1_COMMON *const cm = &cpi->common;
- RATE_CONTROL *const rc = &cpi->rc;
PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
- SPEED_FEATURES *const sf = &cpi->sf;
- int thresh_qp = 7 * (rc->worst_quality >> 3);
- // Lower thresh_qp for video (more overshoot at lower Q) to be
- // more conservative for video.
- if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN)
- thresh_qp = 3 * (rc->worst_quality >> 2);
- if (sf->rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
- cm->quant_params.base_qindex < thresh_qp) {
- double rate_correction_factor =
- cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL];
- const int target_size = cpi->rc.avg_frame_bandwidth;
- double new_correction_factor;
- int target_bits_per_mb;
- double q2;
- int enumerator;
- *q = (3 * cpi->rc.worst_quality + *q) >> 2;
- // For screen content use the max-q set by the user to allow for less
- // overshoot on slide changes.
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
- *q = cpi->rc.worst_quality;
- cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
- // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
- // these parameters will affect QP selection for subsequent frames. If they
- // have settled down to a very different (low QP) state, then not adjusting
- // them may cause next frame to select low QP and overshoot again.
- p_rc->avg_frame_qindex[INTER_FRAME] = *q;
- p_rc->buffer_level = p_rc->optimal_buffer_level;
- p_rc->bits_off_target = p_rc->optimal_buffer_level;
- // Reset rate under/over-shoot flags.
- cpi->rc.rc_1_frame = 0;
- cpi->rc.rc_2_frame = 0;
- // Adjust rate correction factor.
- target_bits_per_mb =
- (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
- // Rate correction factor based on target_bits_per_mb and qp (==max_QP).
- // This comes from the inverse computation of vp9_rc_bits_per_mb().
- q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth);
- enumerator = 1800000; // Factor for inter frame.
- enumerator += (int)(enumerator * q2) >> 12;
- new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
- if (new_correction_factor > rate_correction_factor) {
- rate_correction_factor =
- AOMMIN(2.0 * rate_correction_factor, new_correction_factor);
- if (rate_correction_factor > MAX_BPB_FACTOR)
- rate_correction_factor = MAX_BPB_FACTOR;
- cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] =
- rate_correction_factor;
- }
- // For temporal layers: reset the rate control parameters across all
- // temporal layers.
- if (cpi->svc.number_temporal_layers > 1) {
- SVC *svc = &cpi->svc;
- for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
- int sl = svc->spatial_layer_id;
- const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
- LAYER_CONTEXT *lc = &svc->layer_context[layer];
- RATE_CONTROL *lrc = &lc->rc;
- PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
- lp_rc->avg_frame_qindex[INTER_FRAME] = *q;
- lp_rc->buffer_level = lp_rc->optimal_buffer_level;
- lp_rc->bits_off_target = lp_rc->optimal_buffer_level;
- lrc->rc_1_frame = 0;
- lrc->rc_2_frame = 0;
- lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
- }
+ double rate_correction_factor =
+ cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL];
+ const int target_size = cpi->rc.avg_frame_bandwidth;
+ double new_correction_factor;
+ int target_bits_per_mb;
+ double q2;
+ int enumerator;
+ int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+ *q = (3 * cpi->rc.worst_quality + *q) >> 2;
+ // For screen content use the max-q set by the user to allow for less
+ // overshoot on slide changes.
+ if (is_screen_content) *q = cpi->rc.worst_quality;
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+ // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+ // these parameters will affect QP selection for subsequent frames. If they
+ // have settled down to a very different (low QP) state, then not adjusting
+ // them may cause next frame to select low QP and overshoot again.
+ p_rc->avg_frame_qindex[INTER_FRAME] = *q;
+ p_rc->buffer_level = p_rc->optimal_buffer_level;
+ p_rc->bits_off_target = p_rc->optimal_buffer_level;
+ // Reset rate under/over-shoot flags.
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.rc_2_frame = 0;
+ // Adjust rate correction factor.
+ target_bits_per_mb =
+ (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
+ // Reset rate correction factor: for now base it on target_bits_per_mb
+ // and qp (==max_QP). This comes from the inverse computation of
+ // av1_rc_bits_per_mb().
+ q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth);
+ enumerator = av1_get_bpmb_enumerator(INTER_NORMAL, is_screen_content);
+ new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
+ if (new_correction_factor > rate_correction_factor) {
+ rate_correction_factor =
+ (new_correction_factor + rate_correction_factor) / 2.0;
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] =
+ rate_correction_factor;
+ }
+ // For temporal layers: reset the rate control parameters across all
+ // temporal layers.
+ if (cpi->svc.number_temporal_layers > 1) {
+ SVC *svc = &cpi->svc;
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int sl = svc->spatial_layer_id;
+ const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+ lp_rc->avg_frame_qindex[INTER_FRAME] = *q;
+ lp_rc->buffer_level = lp_rc->optimal_buffer_level;
+ lp_rc->bits_off_target = lp_rc->optimal_buffer_level;
+ lrc->rc_1_frame = 0;
+ lrc->rc_2_frame = 0;
+ lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
}
- return 1;
- } else {
- return 0;
}
+ return 1;
}
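The rate-correction reset inverts the bits-per-mb model: with bits_per_mb roughly enumerator * correction_factor / q, the factor that would have hit the target at the maximum q is target_bits_per_mb * q / enumerator. A standalone sketch of that inversion; 1800000 matches the inter-frame factor in the code this replaces, while the q value and the MAX_BPB_FACTOR cap are assumed:

#include <stdio.h>

#define MAX_BPB_FACTOR 50  // assumed cap, mirroring the clamp above

int main(void) {
  const int target_bits_per_mb = 250;   // avg_frame_bandwidth spread over MBs
  const double q = 48.0;                // real q at the max qindex (assumed)
  const double enumerator = 1800000.0;  // inter-frame bits-per-mb numerator
  double new_correction_factor = target_bits_per_mb * q / enumerator;
  if (new_correction_factor > MAX_BPB_FACTOR)
    new_correction_factor = MAX_BPB_FACTOR;
  printf("new correction factor %.4f\n", new_correction_factor);  // 0.0067
  return 0;
}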
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 6dc5baf85..114778d86 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -228,7 +228,7 @@ typedef struct {
/*!\endcond */
/*!
- * Proposed maximum alloed Q for current frame
+ * Proposed maximum allowed Q for current frame
*/
int active_worst_quality;
@@ -238,7 +238,10 @@ typedef struct {
int cnt_zeromv;
  // Percent of blocks with motion (i.e., with non-zero temporal SAD).
- int high_num_blocks_with_motion;
+ int percent_blocks_with_motion;
+
+ // Maximum value of source sad across all blocks of frame.
+ uint64_t max_block_source_sad;
// For dynamic resize, 1 pass cbr.
RESIZE_STATE resize_state;
@@ -253,6 +256,11 @@ typedef struct {
int frame_level_fast_extra_bits;
double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frame_num_last_gf_refresh;
+
+ int prev_coded_width;
+ int prev_coded_height;
/*!\endcond */
} RATE_CONTROL;
@@ -539,6 +547,8 @@ typedef struct {
int q_history[MAX_Q_HISTORY];
} PRIMARY_RATE_CONTROL;
+/*!\cond */
+
struct AV1_COMP;
struct AV1EncoderConfig;
struct GF_GROUP;
@@ -548,9 +558,8 @@ void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf,
void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc);
-int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
- double correction_factor, aom_bit_depth_t bit_depth,
- const int is_screen_content_type);
+int av1_estimate_bits_at_q(const struct AV1_COMP *cpi, int q,
+ double correction_factor);
double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
@@ -582,7 +591,6 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
// Functions to set parameters for encoding before the actual
// encode_frame_to_data_rate() function.
-struct EncodeFrameParams;
struct EncodeFrameInput;
// Post encode update of the rate control parameters based
@@ -603,7 +611,7 @@ void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
* \param[in] width Frame width
* \param[in] height Frame height
*
- * \return None but updates the relevant rate correction factor in cpi->rc
+ * \remark Updates the relevant rate correction factor in cpi->rc
*/
void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi,
int is_encode_stage, int width,
@@ -634,7 +642,7 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
* \return Returns selected q index to be used for encoding this frame.
* Also, updates \c rc->arf_q.
*/
-int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
int gf_index, int *bottom_index, int *top_index);
/*!\brief Estimates q to achieve a target bits per frame
@@ -654,10 +662,14 @@ int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
int width, int height);
/*!\cond */
+// Gets the appropriate bpmb enumerator based on the frame and content type.
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+ const int is_screen_content_type);
+
// Estimates bits per mb for a given qindex and correction factor.
-int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor, aom_bit_depth_t bit_depth,
- const int is_screen_content_type);
+int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type,
+ int qindex, double correction_factor,
+ int accurate_estimate);
// Clamping utilities for bitrate targets for iframes and pframes.
int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
@@ -679,10 +691,9 @@ int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
// Computes a q delta (in "q index" terms) to get from a starting q value
// to a value that should equate to the given rate ratio.
-int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
- int qindex, double rate_target_ratio,
- const int is_screen_content_type,
- aom_bit_depth_t bit_depth);
+int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi,
+ FRAME_TYPE frame_type, int qindex,
+ double rate_target_ratio);
int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
@@ -700,8 +711,8 @@ void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
-void av1_set_reference_structure_one_pass_rt(struct AV1_COMP *cpi,
- int gf_update);
+void av1_set_rtc_reference_structure_one_layer(struct AV1_COMP *cpi,
+ int gf_update);
/*!\endcond */
/*!\brief Calculates how many bits to use for a P frame in one pass vbr
@@ -766,15 +777,16 @@ int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
*
* \ingroup rate_control
* \param[in] cpi Top level encoder structure
- * \param[in] frame_params Encoder frame parameters
+ * \param[in] frame_type Encoder frame type
* \param[in] frame_input Current and last input source frames
- * \param[in] frame_flags Emcoder frame flags
+ * \param[in] frame_flags Encoder frame flags
*
- * \return Nothing is returned. Instead the settings computed in this
- * funtion are set in: \c frame_params, \c cpi->common, \c cpi->rc, \c cpi->svc.
+ * \remark Nothing is returned. Instead the settings computed in this
+ * function are set in: \c frame_params, \c cpi->common, \c cpi->rc,
+ * \c cpi->svc.
*/
void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
- struct EncodeFrameParams *const frame_params,
+ FRAME_TYPE *const frame_type,
const struct EncodeFrameInput *frame_input,
unsigned int frame_flags);
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 665ea484b..033496202 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -32,6 +32,7 @@
#include "av1/encoder/cost.h"
#include "av1/encoder/encodemv.h"
#include "av1/encoder/encoder.h"
+#include "av1/encoder/nonrd_opt.h"
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
@@ -407,22 +408,18 @@ int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
return rdmult > 0 ? (int)AOMMIN(rdmult, INT_MAX) : 1;
}
-int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
- const aom_bit_depth_t bit_depth = cpi->common.seq_params->bit_depth;
- const FRAME_UPDATE_TYPE update_type =
- cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+ const FRAME_UPDATE_TYPE update_type,
+ const int layer_depth, const int boost_index,
+ const FRAME_TYPE frame_type,
+ const int use_fixed_qp_offsets,
+ const int is_stat_consumption_stage) {
int64_t rdmult =
av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex);
- if (is_stat_consumption_stage(cpi) && !cpi->oxcf.q_cfg.use_fixed_qp_offsets &&
- (cpi->common.current_frame.frame_type != KEY_FRAME)) {
- const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
- const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
- const int layer_depth =
- AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
-
+ if (is_stat_consumption_stage && !use_fixed_qp_offsets &&
+ (frame_type != KEY_FRAME)) {
// Layer depth adjustment
rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7;
-
// ARF boost adjustment
rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
}
@@ -473,10 +470,20 @@ int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
assert(beta > 0.0);
const AV1_COMMON *cm = &cpi->common;
- int q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
- cm->seq_params->bit_depth);
- return (int)(av1_compute_rd_mult(cpi, q) / beta);
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ const int qindex_rdmult = cm->quant_params.base_qindex;
+ return (int)(av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ layer_depth, boost_index, frame_type,
+ cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi)) /
+ beta);
}
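The layer-depth and boost adjustments in av1_compute_rd_mult() are fixed-point multiplies, scaling by factor / 128 through a right shift of 7. A tiny sketch with hypothetical factor values (the real rd_layer_depth_factor and rd_boost_factor tables are not reproduced here):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int64_t rdmult = 1000;
  // Hypothetical entries; the real tables live in rd.c.
  const int layer_depth_factor = 144;  // scales rdmult by 144 / 128 = 1.125
  const int boost_factor = 16;         // adds roughly 16 / 128 = 12.5%
  rdmult = (rdmult * layer_depth_factor) >> 7;  // 1000 -> 1125
  rdmult += (rdmult * boost_factor) >> 7;       // 1125 -> 1265
  printf("%lld\n", (long long)rdmult);          // prints 1265
  return 0;
}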
static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
@@ -507,8 +514,26 @@ void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) {
}
}
-static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd,
+ int use_nonrd_pick_mode) {
int i, bsize, segment_id;
+ THR_MODES mode_indices[RTC_REFS * RTC_MODES] = { 0 };
+ int num_modes_count = use_nonrd_pick_mode ? 0 : MAX_MODES;
+
+ if (use_nonrd_pick_mode) {
+ for (int r_idx = 0; r_idx < RTC_REFS; r_idx++) {
+ const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+ if (ref != INTRA_FRAME) {
+ for (i = 0; i < RTC_INTER_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(inter_mode_list[i])];
+ } else {
+ for (i = 0; i < RTC_INTRA_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(intra_mode_list[i])];
+ }
+ }
+ }
for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
const int qindex = clamp(
@@ -523,10 +548,13 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
const int t = q * rd_thresh_block_size_factor[bsize];
const int thresh_max = INT_MAX / t;
- for (i = 0; i < MAX_MODES; ++i)
- rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
- ? rd->thresh_mult[i] * t / 4
- : INT_MAX;
+ for (i = 0; i < num_modes_count; ++i) {
+ const int mode_index = use_nonrd_pick_mode ? mode_indices[i] : i;
+ rd->threshes[segment_id][bsize][mode_index] =
+ rd->thresh_mult[mode_index] < thresh_max
+ ? rd->thresh_mult[mode_index] * t / 4
+ : INT_MAX;
+ }
}
}
}
@@ -734,8 +762,18 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) {
int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
int frames_since_key = cpi->rc.frames_since_key;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ const int qindex_rdmult =
+ cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q;
rd->RDMULT = av1_compute_rd_mult(
- cpi, cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q);
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
#if CONFIG_RD_COMMAND
if (cpi->oxcf.pass == 2) {
const RD_COMMAND *rd_command = &cpi->rd_command;
@@ -748,7 +786,7 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) {
av1_set_error_per_bit(&x->errorperbit, rd->RDMULT);
- set_block_thresholds(cm, rd);
+ set_block_thresholds(cm, rd, cpi->sf.rt_sf.use_nonrd_pick_mode);
populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf);
const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 8d0277e3b..b1eb154d5 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -56,6 +56,28 @@ extern "C" {
// Factor to weigh the rate for switchable interp filters.
#define SWITCHABLE_INTERP_RATE_FACTOR 1
+#define RTC_REFS 4
+static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = {
+ { LAST_FRAME, NONE_FRAME },
+ { ALTREF_FRAME, NONE_FRAME },
+ { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }
+};
+
+static INLINE int mode_offset(const PREDICTION_MODE mode) {
+ if (mode >= NEARESTMV) {
+ return INTER_OFFSET(mode);
+ } else {
+ switch (mode) {
+ case DC_PRED: return 0;
+ case V_PRED: return 1;
+ case H_PRED: return 2;
+ case SMOOTH_PRED: return 3;
+ default: assert(0); return -1;
+ }
+ }
+}
+
enum {
// Default initialization when we are not using winner mode framework. e.g.
// intrabc
@@ -133,7 +155,9 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
if (!rd_stats_dst->zero_rate)
rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
rd_stats_dst->dist += rd_stats_src->dist;
- rd_stats_dst->sse += rd_stats_src->sse;
+ if (rd_stats_dst->sse < INT64_MAX && rd_stats_src->sse < INT64_MAX) {
+ rd_stats_dst->sse += rd_stats_src->sse;
+ }
rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm;
#if CONFIG_RD_DEBUG
// This may run into problems when monochrome video is
@@ -204,7 +228,12 @@ int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
FRAME_UPDATE_TYPE update_type,
int qindex);
-int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex);
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+ const FRAME_UPDATE_TYPE update_type,
+ const int layer_depth, const int boost_index,
+ const FRAME_TYPE frame_type,
+ const int use_fixed_qp_offsets,
+ const int is_stat_consumption_stage);
void av1_initialize_rd_consts(struct AV1_COMP *cpi);
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index a51b20017..c25db61b2 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2533,7 +2533,7 @@ static AOM_INLINE int prune_zero_mv_with_sse(
* is currently only used by realtime mode as \ref
* av1_interpolation_filter_search is not called during realtime encoding.
*
- * This funciton only searches over two possible filters. EIGHTTAP_REGULAR is
+ * This function only searches over two possible filters. EIGHTTAP_REGULAR is
 * always searched. For lowres clips (<= 240p), MULTITAP_SHARP is also
 * searched. For higher res clips (>240p), EIGHTTAP_SMOOTH is also searched.
* *
@@ -3295,7 +3295,7 @@ void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
const int num_planes = av1_num_planes(cm);
TxfmSearchInfo *txfm_info = &x->txfm_search_info;
int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
- int y_skip_txfm = 0, uv_skip_txfm = 0;
+ uint8_t y_skip_txfm = 0, uv_skip_txfm = 0;
int64_t dist_y = 0, dist_uv = 0;
ctx->rd_stats.skip_txfm = 0;
@@ -3703,13 +3703,6 @@ static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = {
{ ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME },
};
-static const MV_REFERENCE_FRAME real_time_ref_combos[][2] = {
- { LAST_FRAME, NONE_FRAME },
- { ALTREF_FRAME, NONE_FRAME },
- { GOLDEN_FRAME, NONE_FRAME },
- { INTRA_FRAME, NONE_FRAME }
-};
-
typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET;
static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask,
@@ -3892,7 +3885,7 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
}
mask->pred_modes[INTRA_FRAME] |=
- ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
+ ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
}
static AOM_INLINE void init_neighbor_pred_buf(
@@ -5326,19 +5319,11 @@ static void handle_winner_cand(
* InterModeSearchState::intra_search_state so it can be reused later by \ref
* av1_search_palette_mode.
*
- * \return Returns the rdcost of the current intra-mode if it's available,
- * otherwise returns INT64_MAX. The corresponding values in x->e_mbd.mi[0],
- * rd_stats, rd_stats_y/uv, and best_intra_rd are also updated. Moreover, in the
- * first evocation of the function, the chroma intra mode result is cached in
- * intra_search_state to be used in subsequent calls. In the first evaluation
- * with directional mode, a prune_mask computed with histogram of gradient is
- * also stored in intra_search_state.
- *
* \param[in,out] search_state Struct keep track of the prediction mode
* search state in interframe.
*
* \param[in] cpi Top-level encoder structure.
- * \param[in] x Pointer to struct holding all the data for
+ * \param[in,out] x Pointer to struct holding all the data for
* the current prediction block.
* \param[out] rd_cost Stores the best rd_cost among all the
* prediction modes searched.
@@ -5346,21 +5331,21 @@ static void handle_winner_cand(
* \param[in,out] ctx Structure to hold the number of 4x4 blks to
* copy the tx_type and txfm_skip arrays.
* for only the Y plane.
- * \param[in,out] sf_args Stores the list of intra mode candidates
+ * \param[in] sf_args Stores the list of intra mode candidates
* to be searched.
* \param[in] intra_ref_frame_cost The entropy cost for signaling that the
* current ref frame is an intra frame.
* \param[in] yrd_threshold The rdcost threshold for luma intra mode to
* terminate chroma intra mode search.
*
- * \return Returns INT64_MAX if the determined motion mode is invalid and the
- * current motion mode being tested should be skipped. It returns 0 if the
- * motion mode search is a success.
+ * \remark If a new best mode is found, search_state and rd_costs are updated
+ * correspondingly. While x is also modified, it is only used as a temporary
+ * buffer, and the final decisions are stored in search_state.
*/
static AOM_INLINE void search_intra_modes_in_interframe(
InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
- InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost,
+ const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost,
int64_t yrd_threshold) {
const AV1_COMMON *const cm = &cpi->common;
const SPEED_FEATURES *const sf = &cpi->sf;
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 2fead8fc7..78a23d621 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -54,7 +54,7 @@ struct RD_STATS;
during the mode picking process.
* \param[in] best_rd Best RD seen for this block so far.
*
- * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
* is modified to store information about the best mode computed
* in this function. The rd_cost struct is also updated with the RD stats
* corresponding to the best mode found.
@@ -85,7 +85,7 @@ void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
during the mode picking process
* \param[in] best_rd_so_far Best RD seen for this block so far
*
- * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
* is modified to store information about the best mode computed
* in this function. The rd_cost struct is also updated with the RD stats
* corresponding to the best mode found.
@@ -115,7 +115,7 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
* \param[in] ctx Structure to hold snapshot of coding context
during the mode picking process
*
- * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
* is modified to store information about the best mode computed
* in this function. The rd_cost struct is also updated with the RD stats
* corresponding to the best mode found.
@@ -147,7 +147,7 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
* \param[in] ctx Structure to hold snapshot of coding context
during the mode picking process
*
- * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
* is modified to store information about the best mode computed
* in this function. The rd_cost struct is also updated with the RD stats
* corresponding to the best mode found.
@@ -166,11 +166,6 @@ void av1_rd_pick_inter_mode_sb_seg_skip(
void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
-void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
- int mi_col, RD_STATS *this_rdc, int *skippable,
- BLOCK_SIZE bsize, TX_SIZE tx_size, TX_TYPE tx_type,
- int is_inter_mode);
-
static INLINE int coded_to_superres_mi(int mi_col, int denom) {
return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
}
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c
index 7eadbda94..ac7dc16fb 100644
--- a/av1/encoder/reconinter_enc.c
+++ b/av1/encoder/reconinter_enc.c
@@ -28,65 +28,33 @@
#include "av1/common/reconintra.h"
#include "av1/encoder/reconinter_enc.h"
-static void enc_calc_subpel_params(const MV *const src_mv,
- InterPredParams *const inter_pred_params,
- MACROBLOCKD *xd, int mi_x, int mi_y, int ref,
- uint8_t **mc_buf, uint8_t **pre,
- SubpelParams *subpel_params,
- int *src_stride) {
- // These are part of the function signature to use this function through a
- // function pointer. See typedef of 'CalcSubpelParamsFunc'.
- (void)xd;
- (void)mi_x;
- (void)mi_y;
- (void)ref;
- (void)mc_buf;
-
- const struct scale_factors *sf = inter_pred_params->scale_factors;
-
+static AOM_INLINE void enc_calc_subpel_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
- int ssx = inter_pred_params->subsampling_x;
- int ssy = inter_pred_params->subsampling_y;
- int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
- orig_pos_y += src_mv->row * (1 << (1 - ssy));
- int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
- orig_pos_x += src_mv->col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
- pos_x += SCALE_EXTRA_OFF;
- pos_y += SCALE_EXTRA_OFF;
-
- const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
- const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
- const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
- const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
- pos_y = clamp(pos_y, top, bottom);
- pos_x = clamp(pos_x, left, right);
-
- subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
- subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
- subpel_params->xs = sf->x_step_q4;
- subpel_params->ys = sf->y_step_q4;
- *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
- (pos_x >> SCALE_SUBPEL_BITS);
+ init_subpel_params(src_mv, inter_pred_params, subpel_params, pre_buf->width,
+ pre_buf->height);
+ *pre = pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
*src_stride = pre_buf->stride;
}
+#define IS_DEC 0
+#include "av1/common/reconinter_template.inc"
+#undef IS_DEC
+
void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
const MV *src_mv,
InterPredParams *inter_pred_params) {
- av1_build_one_inter_predictor(
- dst, dst_stride, src_mv, inter_pred_params, NULL /* xd */, 0 /* mi_x */,
- 0 /* mi_y */, inter_pred_params->conv_params.do_average /* ref */,
- NULL /* mc_buf */, enc_calc_subpel_params);
+ build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params);
}
static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
int plane, const MB_MODE_INFO *mi,
int bw, int bh, int mi_x, int mi_y) {
- av1_build_inter_predictors(cm, xd, plane, mi, 0 /* build_for_obmc */, bw, bh,
- mi_x, mi_y, NULL /* mc_buf */,
- enc_calc_subpel_params);
+ build_inter_predictors(cm, xd, plane, mi, /*build_for_obmc=*/0, bw, bh, mi_x,
+ mi_y);
}
void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
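The IS_DEC define/include/undef sequence above stamps out the shared reconinter_template.inc for the encoder build (the decoder side instantiates the same template with IS_DEC set to 1). A toy, single-file illustration of this textual-template idiom; the file name, function names and bodies are hypothetical, and the file includes itself so the whole example stays in one source file:

/* template_demo.c -- build with: cc template_demo.c */
#ifndef TEMPLATE_BODY
#include <stdio.h>

#define TEMPLATE_BODY
#define IS_DEC 0
#define FN_NAME build_predictor_enc
#include "template_demo.c" /* first instantiation: encoder flavour */
#undef FN_NAME
#undef IS_DEC

#define IS_DEC 1
#define FN_NAME build_predictor_dec
#include "template_demo.c" /* second instantiation: decoder flavour */
#undef FN_NAME
#undef IS_DEC

int main(void) {
  printf("%d %d\n", build_predictor_enc(3), build_predictor_dec(3)); /* 4 6 */
  return 0;
}
#else
/* The "template": one body, specialised at compile time by IS_DEC. */
static int FN_NAME(int x) {
#if IS_DEC
  return x * 2; /* decoder-only variant */
#else
  return x + 1; /* encoder-only variant */
#endif
}
#endif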
@@ -113,6 +81,32 @@ void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
&inter_pred_params);
}
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *pre_buf = &pd->pre[0];
+ const uint8_t *src =
+ pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+ uint8_t *const dst = dst_buf->buf;
+ int src_stride = pre_buf->stride;
+ int dst_stride = dst_buf->stride;
+ inter_pred_params->ref_frame_buf = *pre_buf;
+
+ // Initialize interp filter for single reference mode.
+ init_interp_filter_params(inter_pred_params->interp_filter_params,
+ &mbmi->interp_filters.as_filters, pd->width,
+ pd->height, /*is_intrabc=*/0);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params,
+ subpel_params);
+}
+
void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col,
const BUFFER_SET *ctx, BLOCK_SIZE bsize,
diff --git a/av1/encoder/reconinter_enc.h b/av1/encoder/reconinter_enc.h
index 5d32545f5..e187a5f1c 100644
--- a/av1/encoder/reconinter_enc.h
+++ b/av1/encoder/reconinter_enc.h
@@ -40,6 +40,10 @@ void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col);
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+
// Build one inter predictor. It is called for building predictor for single
// reference case, or just the 1st or 2nd reference in compound reference case.
// Can build both regular and masked predictors.
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index d31583885..4b4e78779 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -35,7 +35,7 @@ void av1_disable_segmentation(struct segmentation *seg) {
void av1_disable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- seg->feature_mask[segment_id] &= ~(1 << feature_id);
+ seg->feature_mask[segment_id] &= ~(1u << feature_id);
}
void av1_clear_segdata(struct segmentation *seg, int segment_id,
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index b43f80f54..9fbfbd779 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -554,6 +554,7 @@ static void set_allintra_speed_features_framesize_independent(
sf->rt_sf.hybrid_intra_pickmode = 0;
sf->rt_sf.var_part_split_threshold_shift = 9;
sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true;
+ sf->rt_sf.prune_h_pred_using_best_mode_so_far = true;
}
// As the speed feature prune_chroma_modes_using_luma_winner already
@@ -806,6 +807,7 @@ static void set_good_speed_feature_framesize_dependent(
if (!is_480p_or_larger) {
sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
boosted ? INT_MAX : 250;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
}
if (is_480p_or_lesser) {
@@ -840,8 +842,10 @@ static void set_good_speed_feature_framesize_dependent(
if (is_720p_or_larger) {
sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 28);
} else {
sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
}
if (is_720p_or_larger) {
@@ -1102,10 +1106,9 @@ static void set_good_speed_features_framesize_independent(
sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
sf->winner_mode_sf.motion_mode_for_winner_cand =
- boosted ? 0
- : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE
- ? 1
- : 2;
+ boosted ? 0
+ : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE ? 1
+ : 2;
sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4;
// For screen content, "prune_sgr_based_on_wiener = 2" cause large quality
@@ -1138,7 +1141,6 @@ static void set_good_speed_features_framesize_independent(
sf->inter_sf.prune_ext_comp_using_neighbors = 2;
sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
- sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = boosted ? 0 : 1;
sf->interp_sf.cb_pred_filter_search = 1;
sf->interp_sf.skip_sharp_interp_filter_search = 1;
@@ -1227,8 +1229,9 @@ static void set_good_speed_features_framesize_independent(
sf->part_sf.prune_rectangular_split_based_on_qidx =
boosted || allow_screen_content_tools ? 0 : 2;
sf->part_sf.prune_sub_8x8_partition_level =
- allow_screen_content_tools ? 0
- : frame_is_intra_only(&cpi->common) ? 1 : 2;
+ allow_screen_content_tools ? 0
+ : frame_is_intra_only(&cpi->common) ? 1
+ : 2;
sf->part_sf.prune_part4_search = 3;
sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
@@ -1252,6 +1255,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
int speed) {
const AV1_COMMON *const cm = &cpi->common;
const int boosted = frame_is_boosted(cpi);
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
@@ -1261,19 +1265,29 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
if (speed >= 6)
sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 2;
if (speed >= 7) {
sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.use_rtc_tf = 2;
}
- if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 2;
+ if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1;
if (speed >= 8) {
- sf->rt_sf.use_nonrd_filter_search = 0;
+ sf->rt_sf.use_nonrd_filter_search = 1;
sf->rt_sf.tx_size_level_based_on_qstep = 1;
}
if (speed >= 9) {
sf->rt_sf.use_comp_ref_nonrd = 0;
- sf->rt_sf.nonrd_agressive_skip = 1;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
sf->rt_sf.skip_intra_pred = 1;
+ // Only turn on enable_ref_short_signaling for low resolution when only
+ // LAST and GOLDEN ref frames are used.
+ sf->rt_sf.enable_ref_short_signaling =
+ (!sf->rt_sf.use_nonrd_altref_frame &&
+ (!sf->rt_sf.use_comp_ref_nonrd ||
+ (!sf->rt_sf.ref_frame_comp_nonrd[1] &&
+ !sf->rt_sf.ref_frame_comp_nonrd[2])));
+
// TODO(kyslov) Re-enable when AV1 models are trained
#if 0
#if CONFIG_RT_ML_PARTITIONING
@@ -1283,10 +1297,14 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
}
#endif
#endif
+ sf->rt_sf.use_adaptive_subpel_search = false;
}
if (speed >= 10) {
sf->rt_sf.skip_intra_pred = 2;
sf->rt_sf.hybrid_intra_pickmode = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_nonrd_filter_search = 0;
}
} else {
sf->rt_sf.prune_intra_mode_based_on_mv_range = 2;
@@ -1298,24 +1316,36 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
}
if (speed == 6) sf->part_sf.disable_8x8_part_based_on_qidx = 1;
if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 2;
+ if (speed == 7) {
+ sf->rt_sf.prefer_large_partition_blocks = 1;
+ // Enable this feature for [360p, 720p] resolution range initially.
+ if (!cpi->rc.rtc_external_ratectrl &&
+ AOMMIN(cm->width, cm->height) <= 720)
+ sf->hl_sf.accurate_bit_estimate = cpi->oxcf.q_cfg.aq_mode == NO_AQ;
+ }
+ if (speed >= 7) {
+ sf->rt_sf.use_rtc_tf = 1;
+ }
if (speed == 8 && !cpi->ppi->use_svc) {
sf->rt_sf.short_circuit_low_temp_var = 0;
sf->rt_sf.use_nonrd_altref_frame = 1;
}
- if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 3;
if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2;
if (speed >= 9) {
sf->rt_sf.gf_length_lvl = 1;
sf->rt_sf.skip_cdef_sb = 1;
sf->rt_sf.sad_based_adp_altref_lag = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->interp_sf.cb_pred_filter_search = 1;
}
-
if (speed >= 10) {
- // TODO(yunqing): extend this sf to other speeds and/or other resolutions.
- sf->rt_sf.use_rtc_tf = 1;
sf->rt_sf.hybrid_intra_pickmode = 2;
sf->rt_sf.sad_based_adp_altref_lag = 4;
sf->rt_sf.tx_size_level_based_on_qstep = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.use_adaptive_subpel_search = false;
+ sf->interp_sf.cb_pred_filter_search = 2;
}
}
if (!is_480p_or_larger) {
@@ -1323,11 +1353,9 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
sf->rt_sf.nonrd_check_partition_merge_mode = 2;
}
if (speed >= 8) {
- sf->mv_sf.subpel_search_method = SUBPEL_TREE;
sf->rt_sf.estimate_motion_for_var_based_partition = 1;
}
if (speed >= 9) {
- sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
sf->rt_sf.estimate_motion_for_var_based_partition = 0;
}
}
@@ -1337,17 +1365,37 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
}
} else {
if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3;
- if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0;
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ }
if (speed >= 9) {
sf->rt_sf.sad_based_adp_altref_lag = 1;
- sf->rt_sf.sad_based_comp_prune = 1;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
}
if (speed >= 10) {
sf->rt_sf.sad_based_adp_altref_lag = 3;
- sf->rt_sf.sad_based_comp_prune = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ }
+ }
+ // TODO(Any): Check/Tune settings of other sfs for 1080p.
+ if (is_1080p_or_larger) {
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = 0;
}
+ if (speed >= 9) sf->interp_sf.cb_pred_filter_search = 0;
+ } else {
+ if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1;
}
- if (cpi->ppi->use_svc) {
+
+ // Setting for SVC, or when the ref_frame_config control is
+ // used to set the reference structure.
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) {
+ const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
// For SVC: for greater than 2 temporal layers, use better mv search on
// base temporal layers, and only on base spatial layer if highest
// resolution is above 640x360.
@@ -1359,47 +1407,77 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
sf->mv_sf.search_method = NSTEP;
sf->mv_sf.subpel_search_method = SUBPEL_TREE;
sf->rt_sf.fullpel_search_step_param = 6;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
}
- if (speed >= 9) {
+ if (speed >= 8) {
sf->rt_sf.disable_cdf_update_non_reference_frame = true;
- if (cpi->svc.non_reference_frame) sf->rt_sf.nonrd_agressive_skip = 1;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ if (rtc_ref->non_reference_frame) {
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ }
}
- if (cpi->svc.ref_frame_comp[0] || cpi->svc.ref_frame_comp[1] ||
- cpi->svc.ref_frame_comp[2]) {
+ if (speed <= 9 && cpi->svc.number_temporal_layers > 2 &&
+ cpi->svc.temporal_layer_id == 0)
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false;
+ else
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ // Compound mode enabling.
+ if (rtc_ref->ref_frame_comp[0] || rtc_ref->ref_frame_comp[1] ||
+ rtc_ref->ref_frame_comp[2]) {
sf->rt_sf.use_comp_ref_nonrd = 1;
sf->rt_sf.ref_frame_comp_nonrd[0] =
- cpi->svc.ref_frame_comp[0] && cpi->svc.reference[GOLDEN_FRAME - 1];
+ rtc_ref->ref_frame_comp[0] && rtc_ref->reference[GOLDEN_FRAME - 1];
sf->rt_sf.ref_frame_comp_nonrd[1] =
- cpi->svc.ref_frame_comp[1] && cpi->svc.reference[LAST2_FRAME - 1];
+ rtc_ref->ref_frame_comp[1] && rtc_ref->reference[LAST2_FRAME - 1];
sf->rt_sf.ref_frame_comp_nonrd[2] =
- cpi->svc.ref_frame_comp[2] && cpi->svc.reference[ALTREF_FRAME - 1];
+ rtc_ref->ref_frame_comp[2] && rtc_ref->reference[ALTREF_FRAME - 1];
} else {
sf->rt_sf.use_comp_ref_nonrd = 0;
- sf->rt_sf.sad_based_comp_prune = 0;
}
- if (speed <= 9 && cpi->svc.number_temporal_layers > 2 &&
- cpi->svc.temporal_layer_id == 0)
- sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false;
- else
- sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1)
+ sf->hl_sf.accurate_bit_estimate = 0;
}
+ // Screen settings.
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
// TODO(marpan): Check settings for speed 7 and 8.
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
+ }
+ if (speed >= 8) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = false;
+ }
if (speed >= 9) {
sf->rt_sf.prune_idtx_nonrd = 1;
- sf->rt_sf.part_early_exit_zeromv = 1;
+ sf->rt_sf.part_early_exit_zeromv = 2;
sf->rt_sf.skip_lf_screen = 1;
- sf->rt_sf.use_nonrd_filter_search = 0;
sf->rt_sf.nonrd_prune_ref_frame_search = 3;
sf->rt_sf.var_part_split_threshold_shift = 10;
sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
- sf->rt_sf.force_half_pel_block = 1;
- sf->rt_sf.reduce_zeromv_mvres = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->interp_sf.cb_pred_filter_search = 0;
}
+ if (speed >= 10) {
+ if (cm->width * cm->height > 1920 * 1080)
+ sf->part_sf.disable_8x8_part_based_on_qidx = 1;
+ sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
+ sf->rt_sf.part_early_exit_zeromv = 1;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ }
+ sf->rt_sf.use_nonrd_altref_frame = 0;
sf->rt_sf.skip_cdef_sb = 1;
sf->rt_sf.use_rtc_tf = 0;
sf->rt_sf.use_comp_ref_nonrd = 0;
- sf->rt_sf.sad_based_comp_prune = 0;
sf->rt_sf.source_metrics_sb_nonrd = 1;
if (cpi->rc.high_source_sad == 1) {
sf->rt_sf.prefer_large_partition_blocks = 0;
@@ -1411,14 +1489,15 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
}
}
- if (cpi->rc.high_num_blocks_with_motion && speed >= 6) {
+ if (cpi->rc.max_block_source_sad > 20000 &&
+ cpi->rc.frame_source_sad > 100 &&
+ cpi->rc.percent_blocks_with_motion > 1 && speed >= 6) {
sf->mv_sf.search_method = NSTEP;
sf->rt_sf.fullpel_search_step_param = 2;
}
sf->rt_sf.partition_direct_merging = 0;
+ sf->hl_sf.accurate_bit_estimate = 0;
}
- if (cpi->svc.number_temporal_layers > 1 && cpi->svc.temporal_layer_id == 0)
- sf->rt_sf.source_metrics_sb_nonrd = 0;
}
// TODO(kyslov): now this is very similar to
@@ -1615,7 +1694,6 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
sf->mv_sf.search_method = FAST_DIAMOND;
sf->mv_sf.subpel_force_stop = QUARTER_PEL;
- sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
sf->inter_sf.inter_mode_rd_model_estimation = 2;
// This sf is not applicable in non-rd path.
@@ -1667,7 +1745,10 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
sf->winner_mode_sf.dc_blk_pred_level = 0;
sf->rt_sf.var_part_based_on_qidx = 3;
- sf->rt_sf.prune_global_globalmv_with_zeromv = true;
+ sf->rt_sf.prune_compoundmode_with_singlecompound_var = true;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = true;
+ sf->rt_sf.skip_compound_based_on_var = true;
+ sf->rt_sf.use_adaptive_subpel_search = true;
}
if (speed >= 8) {
@@ -1679,16 +1760,16 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
sf->rt_sf.nonrd_prune_ref_frame_search = 2;
sf->rt_sf.nonrd_check_partition_merge_mode = 0;
sf->rt_sf.var_part_split_threshold_shift = 8;
- sf->interp_sf.cb_pred_filter_search = 1;
sf->rt_sf.var_part_based_on_qidx = 4;
sf->rt_sf.partition_direct_merging = 1;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = false;
+ sf->mv_sf.use_bsize_dependent_search_method = 2;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = true;
}
if (speed >= 9) {
- sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3;
- sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 20;
sf->rt_sf.estimate_motion_for_var_based_partition = 0;
- sf->rt_sf.prefer_large_partition_blocks = 4;
+ sf->rt_sf.prefer_large_partition_blocks = 3;
sf->rt_sf.skip_intra_pred = 2;
sf->rt_sf.var_part_split_threshold_shift = 9;
for (int i = 0; i < BLOCK_SIZES; ++i)
@@ -1696,16 +1777,15 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
sf->rt_sf.var_part_based_on_qidx = 0;
sf->rt_sf.frame_level_mode_cost_update = true;
sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
}
if (speed >= 10) {
sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
- sf->rt_sf.nonrd_agressive_skip = 1;
sf->rt_sf.nonrd_prune_ref_frame_search = 3;
sf->rt_sf.var_part_split_threshold_shift = 10;
sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
- sf->rt_sf.force_half_pel_block = 1;
- sf->rt_sf.reduce_zeromv_mvres = true;
- sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
}
}
@@ -1720,6 +1800,7 @@ static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
hl_sf->disable_extra_sc_testing = 0;
hl_sf->second_alt_ref_filtering = 1;
hl_sf->num_frames_used_in_tf = INT_MAX;
+ hl_sf->accurate_bit_estimate = 0;
}
static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
@@ -1994,7 +2075,6 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
rt_sf->use_comp_ref_nonrd = 0;
rt_sf->use_real_time_ref_set = 0;
rt_sf->short_circuit_low_temp_var = 0;
- rt_sf->use_modeled_non_rd_cost = 0;
rt_sf->reuse_inter_pred_nonrd = 0;
rt_sf->num_inter_modes_for_tx_search = INT_MAX;
rt_sf->use_nonrd_filter_search = 0;
@@ -2009,7 +2089,8 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
rt_sf->fullpel_search_step_param = 0;
for (int i = 0; i < BLOCK_SIZES; ++i)
rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL;
- rt_sf->nonrd_agressive_skip = 0;
+ rt_sf->prune_hv_pred_modes_using_src_sad = false;
+ rt_sf->nonrd_aggressive_skip = 0;
rt_sf->skip_cdef_sb = 0;
rt_sf->force_large_partition_blocks_intra = 0;
rt_sf->skip_tx_no_split_var_based_partition = 0;
@@ -2018,7 +2099,8 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
rt_sf->prune_inter_modes_with_golden_ref = 0;
rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
rt_sf->prune_inter_modes_using_temp_var = 0;
- rt_sf->force_half_pel_block = 0;
+ rt_sf->reduce_mv_pel_precision_highmotion = 0;
+ rt_sf->reduce_mv_pel_precision_lowcomplex = 0;
rt_sf->prune_intra_mode_based_on_mv_range = 0;
rt_sf->var_part_split_threshold_shift = 7;
rt_sf->gf_refresh_based_on_qp = 0;
@@ -2030,14 +2112,45 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
rt_sf->sad_based_adp_altref_lag = 0;
rt_sf->partition_direct_merging = 0;
rt_sf->var_part_based_on_qidx = 0;
- rt_sf->sad_based_comp_prune = 0;
rt_sf->tx_size_level_based_on_qstep = 0;
- rt_sf->reduce_zeromv_mvres = false;
rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
- rt_sf->prune_global_globalmv_with_zeromv = false;
+ rt_sf->prune_compoundmode_with_singlecompound_var = false;
rt_sf->frame_level_mode_cost_update = false;
+ rt_sf->prune_h_pred_using_best_mode_so_far = false;
rt_sf->check_only_zero_zeromv_on_large_blocks = false;
rt_sf->disable_cdf_update_non_reference_frame = false;
+ rt_sf->prune_compoundmode_with_singlemode_var = false;
+ rt_sf->skip_compound_based_on_var = false;
+ rt_sf->set_zeromv_skip_based_on_source_sad = 1;
+ rt_sf->use_adaptive_subpel_search = false;
+ rt_sf->screen_content_cdef_filter_qindex_thresh = 0;
+ rt_sf->enable_ref_short_signaling = false;
+ rt_sf->check_globalmv_on_single_ref = true;
+}
+
+// Populate the appropriate sub-pel search method based on the speed feature
+// and user-specified settings.
+static void set_subpel_search_method(
+ MotionVectorSearchParams *mv_search_params,
+ unsigned int motion_vector_unit_test,
+ SUBPEL_SEARCH_METHODS subpel_search_method) {
+ if (subpel_search_method == SUBPEL_TREE) {
+ mv_search_params->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
+ } else if (subpel_search_method == SUBPEL_TREE_PRUNED) {
+ mv_search_params->find_fractional_mv_step =
+ av1_find_best_sub_pixel_tree_pruned;
+ } else if (subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+ mv_search_params->find_fractional_mv_step =
+ av1_find_best_sub_pixel_tree_pruned_more;
+ } else {
+ assert(0);
+ }
+
+ // This is only used in motion vector unit test.
+ if (motion_vector_unit_test == 1)
+ mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (motion_vector_unit_test == 2)
+ mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
}
void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
@@ -2063,11 +2176,9 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
(sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
}
- // This is only used in motion vector unit test.
- if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 1)
- cpi->mv_search_params.find_fractional_mv_step = av1_return_max_sub_pixel_mv;
- else if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 2)
- cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
// For multi-thread use case with row_mt enabled, cost update for a set of
// SB rows is not desirable. Hence, the sf mv_cost_upd_level is set to
@@ -2132,13 +2243,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
(sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
}
- // sf->part_sf.partition_search_breakout_dist_thr is set assuming max 64x64
- // blocks. Normalise this if the blocks are bigger.
- if (MAX_SB_SIZE_LOG2 > 6) {
- sf->part_sf.partition_search_breakout_dist_thr <<=
- 2 * (MAX_SB_SIZE_LOG2 - 6);
- }
-
const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
for (i = 0; i < MAX_MESH_STEP; ++i) {
sf->mv_sf.mesh_patterns[i].range =
@@ -2166,22 +2270,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi))
sf->hl_sf.recode_loop = DISALLOW_RECODE;
- MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
- if (sf->mv_sf.subpel_search_method == SUBPEL_TREE) {
- mv_search_params->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
- } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED) {
- mv_search_params->find_fractional_mv_step =
- av1_find_best_sub_pixel_tree_pruned;
- } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
- mv_search_params->find_fractional_mv_step =
- av1_find_best_sub_pixel_tree_pruned_more;
- }
-
- // This is only used in motion vector unit test.
- if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 1)
- mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
- else if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 2)
- mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
// assert ensures that tx_domain_dist_level is accessed correctly
assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 &&
@@ -2377,4 +2468,8 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
sf->inter_sf.reuse_mask_search_results = 1;
}
}
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
}
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index c55202d4d..00c21e5e2 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -435,6 +435,13 @@ typedef struct HIGH_LEVEL_SPEED_FEATURES {
* levels and arf-q.
*/
int num_frames_used_in_tf;
+
+ /*!
+ * Decide the bit estimation approach used in qindex decision.
+ * 0: estimate bits based on a constant value;
+ * 1: estimate bits more accurately based on the frame complexity.
+ */
+ int accurate_bit_estimate;
} HIGH_LEVEL_SPEED_FEATURES;
/*!
@@ -725,8 +732,8 @@ typedef struct MV_SPEED_FEATURES {
// Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
SEARCH_METHODS search_method;
- // Enable the use of faster, less accurate mv search method on bsize >=
- // BLOCK_32X32.
+ // Enable the use of a faster, less accurate mv search method.
+ // 0: disabled, 1: if bsize >= BLOCK_32X32, 2: based on bsize, SAD and qp
// TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into
// account.
int use_bsize_dependent_search_method;
@@ -920,6 +927,14 @@ typedef struct INTER_MODE_SPEED_FEATURES {
int prune_comp_using_best_single_mode_ref;
// Skip NEARESTMV and NEARMV using weight computed in ref mv list population
+ // This speed feature sometimes leads to severe visual artifacts for
+ // the overlay frame. It makes inter RD mode search skip NEARESTMV
+ // and NEARMV, and no valid inter mode is evaluated when the NEWMV mode
+ // is also early terminated due to the constraint that it does not handle
+ // zero mv difference. In these cases, intra modes will be chosen, leading
+ // to bad prediction and flickering artifacts.
+ // Turn off this feature for now. Be careful to check visual quality if
+ // anyone is going to turn it on.
int prune_nearest_near_mv_using_refmv_weight;
// Based on previous ref_mv_idx search result, prune the following search.
@@ -1038,6 +1053,7 @@ typedef struct INTERP_FILTER_SPEED_FEATURES {
// dual_filter=0 case
int skip_sharp_interp_filter_search;
+ // Skip interpolation filter search for blocks in a chessboard pattern.
int cb_pred_filter_search;
// adaptive interp_filter search to allow skip of certain filter types.
@@ -1423,9 +1439,6 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// temporal variance.
int short_circuit_low_temp_var;
- // Use modeled (currently CurvFit model) RDCost for fast non-RD mode
- int use_modeled_non_rd_cost;
-
// Reuse inter prediction in fast non-rd mode.
int reuse_inter_pred_nonrd;
@@ -1459,7 +1472,7 @@ typedef struct REAL_TIME_SPEED_FEATURES {
int check_scene_detection;
// For nonrd mode: Prefer larger partition blks in variance based partitioning
- // 0: disabled, 1-4: increasing aggressiveness
+ // 0: disabled, 1-3: increasing aggressiveness
int prefer_large_partition_blocks;
// uses results of temporal noise estimate
@@ -1472,11 +1485,22 @@ typedef struct REAL_TIME_SPEED_FEATURES {
int fullpel_search_step_param;
// Bit mask to enable or disable intra modes for each prediction block size
- // separately, for nonrd pickmode.
+ // separately, for nonrd_pickmode. Currently, the sf is not respected when
+ // 'force_intra_check' is true in the 'estimate_intra_mode()' function. Also,
+ // H and V pred modes allowed through this sf can be further pruned when the
+ // 'prune_hv_pred_modes_using_src_sad' sf is true.
int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
- // Skips mode checks more agressively in nonRD mode
- int nonrd_agressive_skip;
+ // Prune H and V intra prediction mode evaluation in inter frames.
+ // The sf does not have any impact:
+ // i. when frame_source_sad is 1.1 times greater than avg_source_sad
+ // ii. when cyclic_refresh_segment_id_boosted is enabled
+ // iii. when SB level source sad is greater than kMedSad
+ // iv. when color sensitivity is non-zero for both the chroma channels
+ bool prune_hv_pred_modes_using_src_sad;
+
+ // Skips mode checks more aggressively in nonRD mode
+ int nonrd_aggressive_skip;
// Skip cdef on 64x64 blocks when NEWMV or INTRA is not picked or color
// sensitivity is off. When color sensitivity is on for a superblock, all
@@ -1508,8 +1532,19 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// variance wrt LAST reference.
int prune_inter_modes_using_temp_var;
- // Force half_pel at block level.
- int force_half_pel_block;
+ // Reduce MV precision to halfpel for larger integer MV values and high
+ // frame-level motion.
+ // 0: disabled
+ // 1-2: Reduce precision to halfpel/fullpel based on conservative thresholds;
+ // aggressiveness increases with the level
+ // 3: Reduce precision to halfpel using more aggressive thresholds
+ int reduce_mv_pel_precision_highmotion;
+
+ // Reduce MV precision for low complexity blocks
+ // 0: disabled
+ // 1: Reduce the mv resolution for zero mv if the variance is low
+ // 2: Switch to halfpel, fullpel based on low block spatial-temporal
+ // complexity.
+ int reduce_mv_pel_precision_lowcomplex;
// Prune intra mode evaluation in inter frames based on mv range.
BLOCK_SIZE prune_intra_mode_based_on_mv_range;
@@ -1529,6 +1564,7 @@ typedef struct REAL_TIME_SPEED_FEATURES {
int gf_refresh_based_on_qp;
// Temporal filtering
+ // The value can be 1 or 2, which indicates the threshold to use.
int use_rtc_tf;
// Prune the use of the identity transform in nonrd_pickmode,
@@ -1543,6 +1579,9 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// For nonrd: early exit out of variance partition that sets the
// block size to superblock size, and sets mode to zeromv-last skip.
+ // 0: disabled
+ // 1: zeromv-skip is enabled at SB level only
+ // 2: zeromv-skip is enabled at SB level and coding block level
int part_early_exit_zeromv;
// Early terminate inter mode search based on sse in non-rd path.
@@ -1554,15 +1593,9 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// Enable/disable partition direct merging.
int partition_direct_merging;
- // SAD based compound mode pruning
- int sad_based_comp_prune;
-
// Level of aggressiveness for obtaining tx size based on qstep
int tx_size_level_based_on_qstep;
- // Reduce the mv resolution for zero mv if the variance is low.
- bool reduce_zeromv_mvres;
-
// Avoid the partitioning of a 16x16 block in variance based partitioning
// (VBP) by making use of minimum and maximum sub-block variances.
// For allintra encode, this speed feature reduces instruction count by 5.39%
@@ -1573,22 +1606,40 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// gain of 0.78%.
bool vbp_prune_16x16_split_using_min_max_sub_blk_var;
- // A qindex threshold that determines whether to use qindex based
- // CDEF filter strength estimation for screen content types.
- // This speed feature has a substantial gain on coding metrics,
- // with moderate increased encoding time.
- // Set to zero to turn off this speed feature.
+ // A qindex threshold that determines whether to use qindex based CDEF filter
+ // strength estimation for screen content types. The strength estimation model
+ // used for screen content prefers to allow cdef filtering for more frames.
+ // This sf is used to limit the frames which go through cdef filtering; the two
+ // extremes below describe its setting.
+ // MAXQ (255): This disables the usage of this sf. Here, the frame does not use
+ // the screen content model, thus reducing the number of frames that go through
+ // cdef filtering.
+ // MINQ (0): Frames always use the screen content model, thus increasing the
+ // number of frames that go through cdef filtering.
+ // This speed feature has a substantial gain on coding metrics, with a moderate
+ // increase in encoding time. Select the threshold based on the speed vs
+ // quality trade-off.
int screen_content_cdef_filter_qindex_thresh;
- // Prunes global_globalmv search if its variance is \gt the globalmv's
- // variance.
- bool prune_global_globalmv_with_zeromv;
+ // Prune compound mode if its variance is higher than the variance of single
+ // modes.
+ bool prune_compoundmode_with_singlecompound_var;
// Allow mode cost update at frame level every couple frames. This
// overrides the command line setting --mode-cost-upd-freq=3 (never update
// except on key frame and first delta).
bool frame_level_mode_cost_update;
+ // Prune H_PRED during intra mode evaluation in the nonrd path based on best
+ // mode so far.
+ //
+ // For allintra encode, this speed feature reduces instruction count by 1.10%
+ // for speed 9 with coding performance change less than 0.04%.
+ // For AVIF image encode, this speed feature reduces encode time by 1.03% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // 0.08%.
+ bool prune_h_pred_using_best_mode_so_far;
+
// If compound is enabled, and the current block size is \geq BLOCK_16X16,
// limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the
// base layer of svc.
@@ -1596,6 +1647,35 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// Allow for disabling cdf update for non reference frames in svc mode.
bool disable_cdf_update_non_reference_frame;
+
+ // Prune compound modes if the single-mode variances do not perform well.
+ bool prune_compoundmode_with_singlemode_var;
+
+ // Skip searching all compound modes if the variance of the single_mode
+ // residue is sufficiently low.
+ bool skip_compound_based_on_var;
+
+ // Sets force_zeromv_skip based on the available source sad. Aggressiveness
+ // increases with the level set for the speed feature.
+ // 0: No setting
+ // 1: If source sad is kZeroSad
+ // 2: If source sad <= kVeryLowSad
+ int set_zeromv_skip_based_on_source_sad;
+
+ // Downgrades the block-level subpel motion search to
+ // av1_find_best_sub_pixel_tree_pruned_more for higher QP and when fullpel
+ // search performed well, zeromv has low sad or low source_var
+ bool use_adaptive_subpel_search;
+
+ // A flag used in RTC case to control frame_refs_short_signaling. Note that
+ // the final decision is made in check_frame_refs_short_signaling(). The flag
+ // can only be turned on when res < 360p and speed >= 9, in which case only
+ // LAST and GOLDEN ref frames are used now.
+ bool enable_ref_short_signaling;
+
+ // A flag that controls if we check or bypass GLOBALMV in rtc single ref frame
+ // case.
+ bool check_globalmv_on_single_ref;
} REAL_TIME_SPEED_FEATURES;
/*!\endcond */
@@ -1686,7 +1766,7 @@ struct AV1_COMP;
* \param[in] cpi Top - level encoder instance structure
* \param[in] speed Speed setting passed in from the command line
*
- * \return No return value but configures the various speed trade off flags
+ * \remark No return value but configures the various speed trade off flags
* based on the passed in speed setting. (Higher speed gives lower
* quality)
*/
@@ -1700,7 +1780,7 @@ void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
* \param[in] cpi Top - level encoder instance structure
* \param[in] speed Speed setting passed in from the command line
*
- * \return No return value but configures the various speed trade off flags
+ * \remark No return value but configures the various speed trade off flags
* based on the passed in speed setting and frame size. (Higher speed
* corresponds to lower quality)
*/
@@ -1713,7 +1793,7 @@ void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
* \param[in] cpi Top - level encoder instance structure
* \param[in] speed Speed setting passed in from the command line
*
- * \return No return value but configures the various speed trade off flags
+ * \remark No return value but configures the various speed trade off flags
* based on the passed in speed setting and current frame's Q index.
* (Higher speed corresponds to lower quality)
*/
diff --git a/av1/encoder/superres_scale.c b/av1/encoder/superres_scale.c
index 283faabe6..f439e700d 100644
--- a/av1/encoder/superres_scale.c
+++ b/av1/encoder/superres_scale.c
@@ -399,8 +399,6 @@ void av1_setup_frame_size(AV1_COMP *cpi) {
void av1_superres_post_encode(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
- if (!av1_superres_scaled(cm)) return;
-
assert(cpi->oxcf.superres_cfg.enable_superres);
assert(!is_lossless_requested(&cpi->oxcf.rc_cfg));
assert(!cm->features.all_lossless);
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index d5f53530f..d31f55d29 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -79,9 +79,19 @@ void av1_init_layer_context(AV1_COMP *const cpi) {
if (svc->number_spatial_layers == 3) {
svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
}
- svc->ref_frame_comp[0] = 0;
- svc->ref_frame_comp[1] = 0;
- svc->ref_frame_comp[2] = 0;
+}
+
+bool av1_alloc_layer_context(AV1_COMP *cpi, int num_layers) {
+ SVC *const svc = &cpi->svc;
+ if (svc->layer_context == NULL || svc->num_allocated_layers < num_layers) {
+ aom_free(svc->layer_context);
+ svc->num_allocated_layers = 0;
+ svc->layer_context =
+ (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context));
+ if (svc->layer_context == NULL) return false;
+ svc->num_allocated_layers = num_layers;
+ }
+ return true;
}
// Update the layer context from a change_config() call.
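The new av1_alloc_layer_context() above follows a reuse-or-grow pattern: keep the existing allocation when it is already large enough, otherwise free it and allocate a zero-initialised array of the requested size, reporting failure to the caller. A generic, self-contained model of that pattern; the type and function names are illustrative, not libaom API:

#include <stdbool.h>
#include <stdlib.h>

typedef struct {
  void *buf;          /* element storage, zeroed on (re)allocation */
  int num_allocated;  /* current capacity in elements */
  size_t elem_size;   /* size of one element */
} GrowableArray;

static bool growable_array_ensure(GrowableArray *a, int num_needed) {
  if (a->buf == NULL || a->num_allocated < num_needed) {
    free(a->buf);
    a->num_allocated = 0;
    a->buf = calloc(num_needed, a->elem_size);
    if (a->buf == NULL) return false;  /* propagate allocation failure */
    a->num_allocated = num_needed;
  }
  return true;  /* buffer was already big enough, or the grow succeeded */
}

int main(void) {
  GrowableArray layers = { NULL, 0, sizeof(int) };
  if (!growable_array_ensure(&layers, 6)) return 1;  /* allocates 6 slots */
  if (!growable_array_ensure(&layers, 4)) return 1;  /* reuses the 6-slot buffer */
  free(layers.buf);
  return 0;
}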
@@ -166,14 +176,15 @@ void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) {
}
static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame(
- int ref_frame, const SVC *svc) {
- int ref_frame_idx = svc->ref_idx[ref_frame - 1];
+ int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) {
+ int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1];
return svc->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
svc->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1;
}
void av1_restore_layer_context(AV1_COMP *const cpi) {
SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
const AV1_COMMON *const cm = &cpi->common;
LAYER_CONTEXT *const lc = get_layer_context(cpi);
const int old_frame_since_key = cpi->rc.frames_since_key;
@@ -199,6 +210,7 @@ void av1_restore_layer_context(AV1_COMP *const cpi) {
cr->sb_index = lc->sb_index;
cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
+ cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change;
}
svc->skip_mvsearch_last = 0;
svc->skip_mvsearch_gf = 0;
@@ -207,14 +219,14 @@ void av1_restore_layer_context(AV1_COMP *const cpi) {
// This is to skip searching mv for that reference if it was last
// refreshed (i.e., buffer slot holding that reference was refreshed) on the
// previous spatial layer(s) at the same time (current_superframe).
- if (svc->set_ref_frame_config && svc->force_zero_mode_spatial_ref) {
- if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc)) {
+ if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref) {
+ if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) {
svc->skip_mvsearch_last = 1;
}
- if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc)) {
+ if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) {
svc->skip_mvsearch_gf = 1;
}
- if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc)) {
+ if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) {
svc->skip_mvsearch_altref = 1;
}
}
@@ -241,6 +253,7 @@ void av1_save_layer_context(AV1_COMP *const cpi) {
lc->sb_index = cr->sb_index;
lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
+ lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
}
// For any buffer slot that is refreshed, update it with
// the spatial_layer_id and the current_superframe.
@@ -250,10 +263,10 @@ void av1_save_layer_context(AV1_COMP *const cpi) {
svc->buffer_time_index[i] = svc->current_superframe;
svc->buffer_spatial_layer[i] = svc->spatial_layer_id;
}
- } else if (cpi->svc.set_ref_frame_config) {
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config) {
for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
- int ref_frame_map_idx = svc->ref_idx[i];
- if (cpi->svc.refresh[ref_frame_map_idx]) {
+ int ref_frame_map_idx = cpi->ppi->rtc_ref.ref_idx[i];
+ if (cpi->ppi->rtc_ref.refresh[ref_frame_map_idx]) {
svc->buffer_time_index[ref_frame_map_idx] = svc->current_superframe;
svc->buffer_spatial_layer[ref_frame_map_idx] = svc->spatial_layer_id;
}
@@ -275,16 +288,29 @@ int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) {
const AV1_COMMON *const cm = &cpi->common;
int fb_idx = -1;
int primary_ref_frame = PRIMARY_REF_NONE;
- // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST
- // was last updated on a lower temporal layer (or base TL0) and for the
- // same spatial layer. For RTC patterns this allows for continued decoding
- // when set of enhancement layers are dropped (continued decoding starting
- // at next base TL0), so error_resilience can be off/0 for all layers.
- fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
- if (svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id &&
- (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id ||
- svc->temporal_layer_fb[fb_idx] == 0)) {
- primary_ref_frame = 0; // LAST_FRAME
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1) {
+ // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST
+ // was last updated on a lower temporal layer (or base TL0) and for the
+ // same spatial layer. For RTC patterns this allows for continued decoding
+ // when set of enhancement layers are dropped (continued decoding starting
+ // at next base TL0), so error_resilience can be off/0 for all layers.
+ fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ if (svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id &&
+ (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id ||
+ svc->temporal_layer_fb[fb_idx] == 0)) {
+ primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME
+ }
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ const ExternalFlags *const ext_flags = &cpi->ext_flags;
+ int flags = ext_flags->ref_frame_flags;
+ if (flags & AOM_LAST_FLAG) {
+ primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME
+ } else if (flags & AOM_GOLD_FLAG) {
+ primary_ref_frame = GOLDEN_FRAME - LAST_FRAME;
+ } else if (flags & AOM_ALT_FLAG) {
+ primary_ref_frame = ALTREF_FRAME - LAST_FRAME;
+ }
}
return primary_ref_frame;
}
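The new else-if branch above derives the primary reference from the externally supplied ref_frame_flags when ref_frame_config drives the reference structure, expressed as an offset from LAST_FRAME. A stand-alone sketch of that mapping; the constants are redefined locally to mirror the aom flag and frame-index layout rather than taken from the headers:

#include <stdio.h>

enum { LAST_FRAME = 1, GOLDEN_FRAME = 4, ALTREF_FRAME = 7 };
enum { AOM_LAST_FLAG = 1 << 0, AOM_GOLD_FLAG = 1 << 3, AOM_ALT_FLAG = 1 << 6 };
#define PRIMARY_REF_NONE 7

static int primary_ref_from_flags(int ref_frame_flags) {
  if (ref_frame_flags & AOM_LAST_FLAG) return 0;  /* LAST_FRAME - LAST_FRAME */
  if (ref_frame_flags & AOM_GOLD_FLAG) return GOLDEN_FRAME - LAST_FRAME;
  if (ref_frame_flags & AOM_ALT_FLAG) return ALTREF_FRAME - LAST_FRAME;
  return PRIMARY_REF_NONE;  /* no usable reference signalled */
}

int main(void) {
  printf("%d\n", primary_ref_from_flags(AOM_GOLD_FLAG | AOM_ALT_FLAG));  /* 3 */
  return 0;
}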
@@ -344,7 +370,10 @@ void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) {
cpi->common.height = height;
alloc_mb_mode_info_buffers(cpi);
av1_update_frame_size(cpi);
- if (svc->spatial_layer_id == 0) svc->high_source_sad_superframe = 0;
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->mi_cols_full_resoln = cpi->common.mi_params.mi_cols;
+ svc->mi_rows_full_resoln = cpi->common.mi_params.mi_rows;
+ }
}
enum {
@@ -361,42 +390,43 @@ enum {
// spatial and temporal layers, and the ksvc_fixed_mode.
void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
int i;
assert(svc->use_flexible_mode == 0);
// Fixed SVC mode only supports at most 3 spatial or temporal layers.
assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 &&
svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3);
- svc->set_ref_frame_config = 1;
+ rtc_ref->set_ref_frame_config = 1;
int superframe_cnt = svc->current_superframe;
// Set the reference map buffer idx for the 7 references:
// LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
// BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = i;
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->reference[i] = 0;
- for (i = 0; i < REF_FRAMES; i++) svc->refresh[i] = 0;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = i;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->reference[i] = 0;
+ for (i = 0; i < REF_FRAMES; i++) rtc_ref->refresh[i] = 0;
// Always reference LAST, and reference GOLDEN on SL > 0.
// For KSVC: GOLDEN reference will be removed on INTER_FRAMES later
// when frame_type is set.
- svc->reference[SVC_LAST_FRAME] = 1;
- if (svc->spatial_layer_id > 0) svc->reference[SVC_GOLDEN_FRAME] = 1;
+ rtc_ref->reference[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id > 0) rtc_ref->reference[SVC_GOLDEN_FRAME] = 1;
if (svc->temporal_layer_id == 0) {
// Base temporal layer.
if (svc->spatial_layer_id == 0) {
// Set all buffer_idx to 0. Update slot 0 (LAST).
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
- svc->refresh[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->refresh[0] = 1;
} else if (svc->spatial_layer_id == 1) {
// Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to
// slot 0. Update slot 1 (LAST).
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
- svc->ref_idx[SVC_LAST_FRAME] = 1;
- svc->refresh[1] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ rtc_ref->refresh[1] = 1;
} else if (svc->spatial_layer_id == 2) {
// Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to
// slot 1. Update slot 2 (LAST).
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 1;
- svc->ref_idx[SVC_LAST_FRAME] = 2;
- svc->refresh[2] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 1;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ rtc_ref->refresh[2] = 1;
}
} else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) {
// First top temporal enhancement layer.
@@ -404,27 +434,27 @@ void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
// Reference LAST (slot 0).
// Set GOLDEN to slot 3 and update slot 3.
// Set all other buffer_idx to slot 0.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
- svc->ref_idx[SVC_GOLDEN_FRAME] = 3;
- svc->refresh[3] = 1;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
}
} else if (svc->spatial_layer_id == 1) {
// Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
// GOLDEN (and all other refs) to slot 3.
// Set LAST2 to slot 4 and Update slot 4.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 3;
- svc->ref_idx[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 3;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
- svc->ref_idx[SVC_LAST2_FRAME] = 4;
- svc->refresh[4] = 1;
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
}
} else if (svc->spatial_layer_id == 2) {
// Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
// GOLDEN (and all other refs) to slot 4.
// No update.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 4;
- svc->ref_idx[SVC_LAST_FRAME] = 2;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 4;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
}
} else if (svc->temporal_layer_id == 1) {
// Middle temporal enhancement layer.
@@ -432,30 +462,30 @@ void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
// Reference LAST.
// Set all buffer_idx to 0.
// Set GOLDEN to slot 5 and update slot 5.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
- svc->ref_idx[SVC_GOLDEN_FRAME] = 5;
- svc->refresh[5] = 1;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5;
+ rtc_ref->refresh[5] = 1;
}
} else if (svc->spatial_layer_id == 1) {
// Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
// GOLDEN (and all other refs) to slot 5.
// Set LAST3 to slot 6 and update slot 6.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 5;
- svc->ref_idx[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
- svc->ref_idx[SVC_LAST3_FRAME] = 6;
- svc->refresh[6] = 1;
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6;
+ rtc_ref->refresh[6] = 1;
}
} else if (svc->spatial_layer_id == 2) {
// Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
// GOLDEN (and all other refs) to slot 6.
// Set LAST3 to slot 7 and update slot 7.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 6;
- svc->ref_idx[SVC_LAST_FRAME] = 2;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 6;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
- svc->ref_idx[SVC_LAST3_FRAME] = 7;
- svc->refresh[7] = 1;
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 7;
+ rtc_ref->refresh[7] = 1;
}
}
} else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) {
@@ -464,28 +494,28 @@ void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
// Set LAST to slot 5 and reference LAST.
// Set GOLDEN to slot 3 and update slot 3.
// Set all other buffer_idx to 0.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
- svc->ref_idx[SVC_LAST_FRAME] = 5;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 5;
if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
- svc->ref_idx[SVC_GOLDEN_FRAME] = 3;
- svc->refresh[3] = 1;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
}
} else if (svc->spatial_layer_id == 1) {
// Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
// GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
- svc->ref_idx[SVC_LAST_FRAME] = 6;
- svc->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 6;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
- svc->ref_idx[SVC_LAST2_FRAME] = 4;
- svc->refresh[4] = 1;
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
}
} else if (svc->spatial_layer_id == 2) {
// Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
// GOLDEN to slot 4. No update.
- for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
- svc->ref_idx[SVC_LAST_FRAME] = 7;
- svc->ref_idx[SVC_GOLDEN_FRAME] = 4;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 7;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 4;
}
}
}
@@ -515,3 +545,28 @@ void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) {
}
}
}
+
+void av1_svc_set_last_source(AV1_COMP *const cpi, EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source) {
+ if (cpi->svc.spatial_layer_id == 0) {
+ // For the base spatial layer: if the LAST reference (index 0) is not
+ // the previous (super)frame, set last_source to the source corresponding
+ // to the last TL0; otherwise keep it at prev_source.
+ frame_input->last_source = prev_source != NULL ? prev_source : NULL;
+ if (cpi->svc.current_superframe > 0) {
+ const int buffslot_last = cpi->ppi->rtc_ref.ref_idx[0];
+ if (cpi->svc.buffer_time_index[buffslot_last] <
+ cpi->svc.current_superframe - 1)
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ }
+ } else if (cpi->svc.spatial_layer_id > 0) {
+ // For spatial enhancement layers: prev_source corresponds to the lower
+ // spatial layer (the same source frame, so it cannot serve as a previous
+ // source); always set last_source to the source of the last TL0.
+ if (cpi->svc.current_superframe > 0)
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ else
+ frame_input->last_source = NULL;
+ }
+}
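For readers tracing the new helper above, a condensed sketch (purely illustrative, not part of the patch) of the base-spatial-layer decision it encodes: LAST is usable as last_source only if the buffer slot holding it was written on the previous superframe; otherwise the saved TL0 source is used.

  // Sketch only -- not the patch's implementation.
  static YV12_BUFFER_CONFIG *pick_last_source_sketch(AV1_COMP *cpi,
                                                     YV12_BUFFER_CONFIG *prev_source) {
    const int slot_last = cpi->ppi->rtc_ref.ref_idx[0];
    if (cpi->svc.current_superframe > 0 &&
        cpi->svc.buffer_time_index[slot_last] < cpi->svc.current_superframe - 1)
      return &cpi->svc.source_last_TL0;  // LAST is older than the previous frame
    return prev_source;                  // LAST is the previous (super)frame
  }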
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index e3c2653b3..5e983f648 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -11,6 +11,7 @@
#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+#include "aom_scale/yv12config.h"
#include "av1/encoder/aq_cyclicrefresh.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/ratectrl.h"
@@ -90,22 +91,11 @@ typedef struct SVC {
int temporal_layer_id;
int number_spatial_layers;
int number_temporal_layers;
- int set_ref_frame_config;
- int non_reference_frame;
int use_flexible_mode;
int ksvc_fixed_mode;
- int ref_frame_comp[3];
/*!\endcond */
- /*!
- * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
- * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
- */
- int reference[INTER_REFS_PER_FRAME];
/*!\cond */
- int ref_idx[INTER_REFS_PER_FRAME];
- int refresh[REF_FRAMES];
- int gld_idx_1layer;
double base_framerate;
unsigned int current_superframe;
unsigned int buffer_time_index[REF_FRAMES];
@@ -117,13 +107,20 @@ typedef struct SVC {
int temporal_layer_fb[REF_FRAMES];
int num_encoded_top_layer;
int first_layer_denoise;
- int high_source_sad_superframe;
+ YV12_BUFFER_CONFIG source_last_TL0;
+ int mi_cols_full_resoln;
+ int mi_rows_full_resoln;
/*!\endcond */
/*!
* Layer context used for rate control in CBR mode.
*/
- LAYER_CONTEXT layer_context[AOM_MAX_LAYERS];
+ LAYER_CONTEXT *layer_context;
+
+ /*!
+ * Number of layers allocated for layer_context.
+ */
+ int num_allocated_layers;
/*!
* EIGHTTAP_SMOOTH or BILINEAR
@@ -143,6 +140,7 @@ typedef struct SVC {
} SVC;
struct AV1_COMP;
+struct EncodeFrameInput;
/*!\brief Initialize layer context data from init_config().
*
@@ -152,10 +150,24 @@ struct AV1_COMP;
*
* \param[in] cpi Top level encoder structure
*
- * \return Nothing returned. Set cpi->svc.
+ * \remark Nothing returned. Set cpi->svc.
*/
void av1_init_layer_context(struct AV1_COMP *const cpi);
+/*!\brief Allocate layer context data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] num_layers Number of layers to be allocated
+ *
+ * \remark Allocates memory for cpi->svc.layer_context.
+ * \return True on success, false on allocation failure.
+ */
+bool av1_alloc_layer_context(struct AV1_COMP *cpi, int num_layers);
+
/*!\brief Update the layer context from a change_config() call.
*
* \ingroup SVC
@@ -165,7 +177,7 @@ void av1_init_layer_context(struct AV1_COMP *const cpi);
* \param[in] cpi Top level encoder structure
* \param[in] target_bandwidth Total target bandwidth
*
- * \return Nothing returned. Buffer level for each layer is set.
+ * \remark Nothing returned. Buffer level for each layer is set.
*/
void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
const int64_t target_bandwidth);
@@ -179,7 +191,7 @@ void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
*
* \param[in] cpi Top level encoder structure
*
- * \return Nothing returned. Frame related quantities for current temporal
+ * \remark Nothing returned. Frame related quantities for current temporal
layer are updated.
*/
void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
@@ -193,7 +205,7 @@ void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
*
* \param[in] cpi Top level encoder structure
*
- * \return Nothing returned. Layer context for current layer is set.
+ * \remark Nothing returned. Layer context for current layer is set.
*/
void av1_restore_layer_context(struct AV1_COMP *const cpi);
@@ -204,8 +216,6 @@ void av1_restore_layer_context(struct AV1_COMP *const cpi);
* \callergraph
*
* \param[in] cpi Top level encoder structure
- *
- * \return Nothing returned.
*/
void av1_save_layer_context(struct AV1_COMP *const cpi);
@@ -216,8 +226,6 @@ void av1_save_layer_context(struct AV1_COMP *const cpi);
* \callergraph
*
* \param[in] cpi Top level encoder structure
- *
- * \return Nothing returned.
*/
void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
@@ -229,8 +237,6 @@ void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
*
* \param[in] cpi Top level encoder structure
* \param[in] is_key Whether current layer is key frame
- *
- * \return Nothing returned.
*/
void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
@@ -241,8 +247,6 @@ void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
* \callergraph
*
* \param[in] cpi Top level encoder structure
- *
- * \return Nothing returned.
*/
void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi);
@@ -268,7 +272,7 @@ int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi);
* \param[in] width_out Output width, scaled for current layer
* \param[in] height_out Output height, scaled for current layer
*
- * \return Nothing is returned. Instead the scaled width and height are set.
+ * \remark Nothing is returned. Instead the scaled width and height are set.
*/
void av1_get_layer_resolution(const int width_org, const int height_org,
const int num, const int den, int *width_out,
@@ -278,6 +282,10 @@ void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi);
void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi);
+void av1_svc_set_last_source(struct AV1_COMP *const cpi,
+ struct EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source);
+
#ifdef __cplusplus
} // extern "C"
#endif
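The struct changes above turn layer_context into a heap array sized by av1_alloc_layer_context(). A plausible allocation pattern is sketched below; the helper name is hypothetical and only aom_calloc/aom_free (the stock libaom allocators) are assumed, so this need not match the patch's actual implementation.

  // Sketch: (re)allocate per-layer contexts when the layer count grows.
  static bool ensure_layer_context_sketch(SVC *svc, int num_layers) {
    if (svc->layer_context && svc->num_allocated_layers >= num_layers) return true;
    aom_free(svc->layer_context);
    svc->layer_context =
        (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context));
    if (!svc->layer_context) {
      svc->num_allocated_layers = 0;
      return false;  // caller reports AOM_CODEC_MEM_ERROR
    }
    svc->num_allocated_layers = num_layers;
    return true;
  }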
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index cc7b49f92..4845c5dfd 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -81,7 +81,7 @@ static void tf_determine_block_partition(const MV block_mv, const int block_mse,
* \param[out] subblock_mses Pointer to the search errors (MSE) for 4
* sub-blocks
*
- * \return Nothing will be returned. Results are saved in subblock_mvs and
+ * \remark Nothing will be returned. Results are saved in subblock_mvs and
* subblock_mses
*/
static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
@@ -109,9 +109,6 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
// Parameters used for motion search.
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
SUBPEL_MOTION_SEARCH_PARAMS ms_params;
- const SEARCH_METHODS search_method = NSTEP;
- const search_site_config *search_site_cfg = av1_get_search_site_config(
- mb->search_site_cfg_buf, &cpi->mv_search_params, search_method, y_stride);
const int step_param = av1_init_search_range(
AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
@@ -131,6 +128,11 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
mb->plane[0].src.stride = y_stride;
mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
mbd->plane[0].pre[0].stride = y_stride;
+
+ const SEARCH_METHODS search_method = NSTEP;
+ const search_site_config *search_site_cfg =
+ av1_get_search_site_config(cpi, mb, search_method);
+
// Unused intermediate results for motion search.
unsigned int sse, error;
int distortion;
@@ -187,7 +189,7 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
block_mv = best_mv.as_mv;
*ref_mv = best_mv.as_mv;
// On 4 sub-blocks.
- const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1];
+ const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
const int subblock_height = block_size_high[subblock_size];
const int subblock_width = block_size_wide[subblock_size];
const int subblock_pels = subblock_height * subblock_width;
@@ -323,7 +325,7 @@ static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
* order)
* \param[out] pred Pointer to the predictor to be built
*
- * \return Nothing returned, But the contents of `pred` will be modified
+ * \remark Nothing returned, but the contents of `pred` will be modified
*/
static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
const MACROBLOCKD *mbd,
@@ -551,7 +553,7 @@ void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
* \param[out] count Pointer to the pixel-wise counter for
* filtering
*
- * \return Nothing returned, But the contents of `accum`, `pred` and 'count'
+ * \remark Nothing returned, but the contents of `accum`, `pred`, and `count`
* will be modified
*/
void av1_apply_temporal_filter_c(
@@ -734,7 +736,7 @@ void av1_highbd_apply_temporal_filter_c(
* \param[in] count Pointer to the pre-computed count
* \param[out] result_buffer Pointer to result buffer
*
- * \return Nothing returned, but the content to which `result_buffer` pointer
+ * \remark Nothing returned, but the content to which `result_buffer` points
* will be modified
*/
static void tf_normalize_filtered_frame(
@@ -914,7 +916,7 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
* \ingroup src_frame_proc
* \param[in] cpi Top level encoder instance structure
*
- * \return Nothing will be returned, but the contents of td->diff will be
+ * \remark Nothing will be returned, but the contents of td->diff will be
modified.
*/
static void tf_do_filtering(AV1_COMP *cpi) {
@@ -949,7 +951,7 @@ static void tf_do_filtering(AV1_COMP *cpi) {
* in the lookahead buffer cpi->lookahead
* \param[in] gf_frame_index GOP index
*
- * \return Nothing will be returned. But the fields `frames`, `num_frames`,
+ * \remark Nothing will be returned. But the fields `frames`, `num_frames`,
* `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
*/
static void tf_setup_filtering_buffer(AV1_COMP *cpi,
@@ -1040,7 +1042,7 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi,
num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
if (frame_type == KEY_FRAME) {
- num_before = is_forward_keyframe ? num_frames / 2 : 0;
+ num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before);
num_after = AOMMIN(num_frames - 1, max_after);
} else {
int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame,
@@ -1295,17 +1297,6 @@ void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) {
"Failed to allocate tf_info");
}
}
-
- ret = aom_realloc_frame_buffer(
- &tf_info->tf_buf_second_arf, oxcf->frm_dim_cfg.width,
- oxcf->frm_dim_cfg.height, seq_params->subsampling_x,
- seq_params->subsampling_y, seq_params->use_highbitdepth,
- cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL,
- cpi->oxcf.tool_cfg.enable_global_motion, 0);
- if (ret) {
- aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
- "Failed to allocate tf_info");
- }
}
void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) {
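One behavioral fix worth noting in the hunk above: num_before for key frames is now clamped by max_before. A quick numeric check with hypothetical values:

  // With num_frames = 8 but only max_before = 2 frames available before the
  // key frame, the old expression asked for num_frames / 2 = 4 frames and
  // could reach outside the usable window; the clamp keeps it at 2.
  const int num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before);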
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 633dbe462..725bd869d 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -287,7 +287,7 @@ double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
* \param[in] mb_row Macroblock row to be filtered
filtering
*
-* \return Nothing will be returned, but the contents of td->diff will be
+* \remark Nothing will be returned, but the contents of td->diff will be
modified.
*/
void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td,
diff --git a/av1/encoder/thirdpass.c b/av1/encoder/thirdpass.c
index d5265540d..a25522fbc 100644
--- a/av1/encoder/thirdpass.c
+++ b/av1/encoder/thirdpass.c
@@ -8,7 +8,9 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include "av1/encoder/thirdpass.h"
+#if CONFIG_THREE_PASS && CONFIG_AV1_DECODER
#include "aom/aom_codec.h"
#include "aom/aomdx.h"
#include "aom_dsp/psnr.h"
@@ -16,14 +18,9 @@
#include "av1/av1_iface_common.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/firstpass.h"
-#include "av1/encoder/thirdpass.h"
#include "av1/common/blockd.h"
-
-#if CONFIG_THREE_PASS
#include "common/ivfdec.h"
-#endif
-#if CONFIG_THREE_PASS
static void setup_two_pass_stream_input(
struct AvxInputContext **input_ctx_ptr, const char *input_file_name,
struct aom_internal_error_info *err_info) {
@@ -64,7 +61,6 @@ static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) {
ctx->err_info);
}
-#if CONFIG_AV1_DECODER
if (!ctx->decoder.iface) {
aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo;
if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) {
@@ -72,24 +68,17 @@ static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) {
"Failed to initialize decoder.");
}
}
-#else
- aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
- "To utilize three-pass encoding, libaom must be built "
- "with CONFIG_AV1_DECODER=1.");
-#endif
}
-#endif // CONFIG_THREE_PASS
// Return 0: success
// 1: cannot read because this is end of file
// -1: failure to read the frame
static int read_frame(THIRD_PASS_DEC_CTX *ctx) {
-#if CONFIG_THREE_PASS
if (!ctx->input_ctx || !ctx->decoder.iface) {
init_third_pass(ctx);
}
if (!ctx->have_frame) {
- if (ivf_read_frame(ctx->input_ctx->file, &ctx->buf, &ctx->bytes_in_buffer,
+ if (ivf_read_frame(ctx->input_ctx, &ctx->buf, &ctx->bytes_in_buffer,
&ctx->buffer_size, NULL) != 0) {
if (feof(ctx->input_ctx->file)) {
return 1;
@@ -101,10 +90,7 @@ static int read_frame(THIRD_PASS_DEC_CTX *ctx) {
ctx->end_frame = ctx->frame + ctx->bytes_in_buffer;
ctx->have_frame = 1;
}
-#else
- aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
- "Cannot parse bitstream without CONFIG_THREE_PASS.");
-#endif
+
Av1DecodeReturn adr;
if (aom_codec_decode(&ctx->decoder, ctx->frame,
(unsigned int)ctx->bytes_in_buffer,
@@ -141,7 +127,7 @@ static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) {
aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
"Third pass frame info ran out of available slots.");
}
- int frame_type_flags = 0;
+ aom_codec_frame_flags_t frame_type_flags = 0;
if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS,
&frame_type_flags) != AOM_CODEC_OK) {
aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
@@ -404,10 +390,8 @@ void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) {
if (ctx->decoder.iface) {
aom_codec_destroy(&ctx->decoder);
}
-#if CONFIG_THREE_PASS
if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file);
aom_free(ctx->input_ctx);
-#endif
if (ctx->buf) free(ctx->buf);
for (int i = 0; i < MAX_THIRD_PASS_BUF; i++) {
free_frame_info(&ctx->frame_info[i]);
@@ -706,6 +690,119 @@ PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
return corner_mi->partition;
}
+#else // !(CONFIG_THREE_PASS && CONFIG_AV1_DECODER)
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file) {
+ (void)ctx;
+ (void)file;
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "To utilize three-pass encoding, libaom must be built "
+ "with CONFIG_THREE_PASS=1 & CONFIG_AV1_DECODER=1.");
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read) {
+ (void)cpi;
+ (void)is_read;
+}
+
+void av1_close_second_pass_log(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index) {
+ (void)cpi;
+ (void)gf_index;
+}
+
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)gop_info;
+ (void)error;
+}
+
+void av1_read_second_pass_per_frame_info(
+ FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count, struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)frame_info_arr;
+ (void)frame_info_count;
+ (void)error;
+}
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+ (void)ctx;
+ return 1;
+}
+
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)fheight;
+ (void)fwidth;
+ (void)ratio_h;
+ (void)ratio_w;
+}
+
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)mi_row;
+ (void)mi_col;
+ (void)ratio_h;
+ (void)ratio_w;
+ return NULL;
+}
+
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)frame;
+ int_mv mv;
+ mv.as_int = INVALID_MV;
+ return mv;
+}
+
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h,
+ double ratio_w) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ return BLOCK_INVALID;
+}
+
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col) {
+ (void)third_pass_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)mi_row;
+ (void)mi_col;
+}
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi) {
+ (void)ctx;
+ (void)this_mi;
+ return PARTITION_INVALID;
+}
+#endif // CONFIG_THREE_PASS && CONFIG_AV1_DECODER
+
#if CONFIG_BITRATE_ACCURACY
static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb,
FILE *stream,
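A small but easy-to-miss fix in this file: AOMD_GET_FRAME_FLAGS writes an aom_codec_frame_flags_t through the pointer handed to aom_codec_control(), so the receiving variable must have that type rather than plain int. A minimal usage sketch (error handling trimmed):

  aom_codec_frame_flags_t flags = 0;  // previously declared as int
  if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS, &flags) !=
      AOM_CODEC_OK) {
    // report the failure through ctx->err_info, as the surrounding code does
  }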
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index de6d452ab..ffac886e3 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -27,6 +27,157 @@
#include "av1/encoder/rdopt.h"
#include "av1/encoder/tokenize.h"
+static AOM_INLINE int av1_fast_palette_color_index_context_on_edge(
+ const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+ const bool has_left = (c - 1 >= 0);
+ const bool has_above = (r - 1 >= 0);
+ assert(r > 0 || c > 0);
+ assert(has_above ^ has_left);
+ assert(color_idx);
+ (void)has_left;
+
+ const uint8_t color_neighbor = has_above
+ ? color_map[(r - 1) * stride + (c - 0)]
+ : color_map[(r - 0) * stride + (c - 1)];
+ // If the neighbor color has a higher index than the current color index, we
+ // move up by 1.
+ const uint8_t current_color = *color_idx = color_map[r * stride + c];
+ if (color_neighbor > current_color) {
+ (*color_idx)++;
+ } else if (color_neighbor == current_color) {
+ *color_idx = 0;
+ }
+
+ // Get hash value of context.
+ // The non-diagonal neighbors get a weight of 2.
+ const uint8_t color_score = 2;
+ const uint8_t hash_multiplier = 1;
+ const uint8_t color_index_ctx_hash = color_score * hash_multiplier;
+
+ // Lookup context from hash.
+ const int color_index_ctx =
+ av1_palette_color_index_context_lookup[color_index_ctx_hash];
+ assert(color_index_ctx == 0);
+ (void)color_index_ctx;
+ return 0;
+}
+
+#define SWAP(i, j) \
+ do { \
+ const uint8_t tmp_score = score_rank[i]; \
+ const uint8_t tmp_color = color_rank[i]; \
+ score_rank[i] = score_rank[j]; \
+ color_rank[i] = color_rank[j]; \
+ score_rank[j] = tmp_score; \
+ color_rank[j] = tmp_color; \
+ } while (0)
+#define INVALID_COLOR_IDX (UINT8_MAX)
+
+// A faster version of av1_get_palette_color_index_context used by the encoder
+// exploiting the fact that the encoder does not need to maintain a color order.
+static AOM_INLINE int av1_fast_palette_color_index_context(
+ const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+ assert(r > 0 || c > 0);
+
+ const bool has_above = (r - 1 >= 0);
+ const bool has_left = (c - 1 >= 0);
+ assert(has_above || has_left);
+ if (has_above ^ has_left) {
+ return av1_fast_palette_color_index_context_on_edge(color_map, stride, r, c,
+ color_idx);
+ }
+
+ // This goes in the order of left, top, and top-left. This has the advantage
+ // that unless some entries are duplicates or invalid, the array will already
+ // be in sorted order. Furthermore, if either of the first two is
+ // invalid, we know the last one is also invalid.
+ uint8_t color_neighbors[NUM_PALETTE_NEIGHBORS];
+ color_neighbors[0] = color_map[(r - 0) * stride + (c - 1)];
+ color_neighbors[1] = color_map[(r - 1) * stride + (c - 0)];
+ color_neighbors[2] = color_map[(r - 1) * stride + (c - 1)];
+
+ // Aggregate duplicated values.
+ // Since our array is so small, using a couple of if statements is faster.
+ uint8_t scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 };
+ uint8_t num_invalid_colors = 0;
+ if (color_neighbors[0] == color_neighbors[1]) {
+ scores[0] += scores[1];
+ color_neighbors[1] = INVALID_COLOR_IDX;
+ num_invalid_colors += 1;
+
+ if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ }
+ } else if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ } else if (color_neighbors[1] == color_neighbors[2]) {
+ scores[1] += scores[2];
+ num_invalid_colors += 1;
+ }
+
+ const uint8_t num_valid_colors = NUM_PALETTE_NEIGHBORS - num_invalid_colors;
+
+ uint8_t *color_rank = color_neighbors;
+ uint8_t *score_rank = scores;
+
+ // Sort everything
+ if (num_valid_colors > 1) {
+ if (color_neighbors[1] == INVALID_COLOR_IDX) {
+ scores[1] = scores[2];
+ color_neighbors[1] = color_neighbors[2];
+ }
+
+ // Swap the first two elements if the first has a lower score, or if the
+ // scores are equal but the color indices are out of order.
+ if (score_rank[0] < score_rank[1] ||
+ (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) {
+ SWAP(0, 1);
+ }
+ if (num_valid_colors > 2) {
+ if (score_rank[0] < score_rank[2]) {
+ SWAP(0, 2);
+ }
+ if (score_rank[1] < score_rank[2]) {
+ SWAP(1, 2);
+ }
+ }
+ }
+
+ // If any of the neighbor colors has a higher index than the current color,
+ // we move up by 1, unless the current color is the same as one of the
+ // neighbors.
+ const uint8_t current_color = *color_idx = color_map[r * stride + c];
+ for (int idx = 0; idx < num_valid_colors; idx++) {
+ if (color_rank[idx] > current_color) {
+ (*color_idx)++;
+ } else if (color_rank[idx] == current_color) {
+ *color_idx = idx;
+ break;
+ }
+ }
+
+ // Get hash value of context.
+ uint8_t color_index_ctx_hash = 0;
+ static const uint8_t hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+ for (int idx = 0; idx < num_valid_colors; ++idx) {
+ color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx];
+ }
+ assert(color_index_ctx_hash > 0);
+ assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+ // Lookup context from hash.
+ const int color_index_ctx = 9 - color_index_ctx_hash;
+ assert(color_index_ctx ==
+ av1_palette_color_index_context_lookup[color_index_ctx_hash]);
+ assert(color_index_ctx >= 0);
+ assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+ return color_index_ctx;
+}
+#undef INVALID_COLOR_IDX
+#undef SWAP
+
static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t,
int plane, int calc_rate, int allow_update_cdf,
FRAME_COUNTS *counts) {
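To make the fast-context hashing above concrete, a worked example with illustrative values: suppose the left and top neighbors carry the same palette index and the top-left neighbor a different one. The duplicate merge folds the scores {2, 2, 1} into {4, 1}, ranking leaves score_rank = {4, 1}, the hash is 4 * 1 + 1 * 2 = 6, and the returned context is 9 - 6 = 3, matching av1_palette_color_index_context_lookup[6] in the slower reference path (as the asserts in the new function check).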
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index b05477056..2ad11228c 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -196,7 +196,7 @@ void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi,
}
}
-static AOM_INLINE int64_t tpl_get_satd_cost(BitDepthInfo bd_info,
+static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info,
int16_t *src_diff, int diff_stride,
const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride,
@@ -364,8 +364,8 @@ static void get_rate_distortion(
for (int plane = 0; plane < num_planes; ++plane) {
struct macroblockd_plane *pd = &xd->plane[plane];
BLOCK_SIZE bsize_plane =
- ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x]
- [pd->subsampling_y];
+ av1_ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x]
+ [pd->subsampling_y];
int dst_buffer_stride = rec_stride_pool[plane];
int dst_mb_offset =
@@ -458,8 +458,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index;
- int64_t best_intra_cost = INT64_MAX;
- int64_t intra_cost;
+ int32_t best_intra_cost = INT32_MAX;
+ int32_t intra_cost;
PREDICTION_MODE best_mode = DC_PRED;
int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
@@ -569,6 +569,19 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
}
}
+ int rate_cost = 1;
+
+ if (cpi->use_ducky_encode) {
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, NULL);
+
+ tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_rate = rate_cost;
+ }
+
if (cpi->third_pass_ctx &&
frame_offset < cpi->third_pass_ctx->frame_info_count &&
tpl_data->frame_idx < gf_group->size) {
@@ -606,8 +619,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
int best_rf_idx = -1;
int_mv best_mv[2];
- int64_t inter_cost;
- int64_t best_inter_cost = INT64_MAX;
+ int32_t inter_cost;
+ int32_t best_inter_cost = INT32_MAX;
int rf_idx;
int_mv single_mv[INTER_REFS_PER_FRAME];
@@ -878,7 +891,7 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME;
}
- if (best_inter_cost < INT64_MAX) {
+ if (best_inter_cost < INT32_MAX) {
xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
xd->mi[0]->mv[1].as_int = best_mv[1].as_int;
const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
@@ -889,30 +902,31 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
: NULL,
};
- int rate_cost = 1;
+ rate_cost = 1;
get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
use_y_only_rate_distortion, NULL);
- tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_rate = rate_cost;
}
best_intra_cost = AOMMAX(best_intra_cost, 1);
best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
- tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
- tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->inter_cost = best_inter_cost;
+ tpl_stats->intra_cost = best_intra_cost;
tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
// Final encode
- int rate_cost = 0;
+ rate_cost = 0;
const YV12_BUFFER_CONFIG *ref_frame_ptr[2];
ref_frame_ptr[0] =
best_mode == NEW_NEWMV
? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
- : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx] : NULL;
+ : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx]
+ : NULL;
ref_frame_ptr[1] =
best_mode == NEW_NEWMV
? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
@@ -922,12 +936,13 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
use_y_only_rate_distortion, tpl_txfm_stats);
- tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
- tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_rate = rate_cost;
if (!is_inter_mode(best_mode)) {
- tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
- tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_rate = rate_cost;
tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
}
@@ -943,7 +958,7 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
use_y_only_rate_distortion, NULL);
tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
- tpl_stats->cmp_recrf_rate[0] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[0] = rate_cost;
tpl_stats->cmp_recrf_dist[0] =
AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]);
@@ -964,7 +979,7 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
use_y_only_rate_distortion, NULL);
tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
- tpl_stats->cmp_recrf_rate[1] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[1] = rate_cost;
tpl_stats->cmp_recrf_dist[1] =
AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]);
@@ -1098,15 +1113,18 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref]
: tpl_stats_ptr->srcrf_dist;
- int64_t srcrf_rate = is_compound ? tpl_stats_ptr->cmp_recrf_rate[!ref]
- : tpl_stats_ptr->srcrf_rate;
+ int64_t srcrf_rate =
+ is_compound
+ ? (tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2)
+ : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist;
int64_t mc_dep_dist =
(int64_t)(tpl_stats_ptr->mc_dep_dist *
((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) /
tpl_stats_ptr->recrf_dist));
- int64_t delta_rate = tpl_stats_ptr->recrf_rate - srcrf_rate;
+ int64_t delta_rate =
+ (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate;
int64_t mc_dep_rate =
av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
srcrf_dist, pix_num);
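The net effect of the rate-field changes in this file: the per-block rate terms (srcrf_rate, recrf_rate, cmp_recrf_rate, intra_rate) are now stored unscaled as 32-bit values, and the TPL_DEP_COST_SCALE_LOG2 shift is applied once, where the rate enters the 64-bit cost arithmetic. Restated as a sketch of the convention (mirroring the code above, not new behavior):

  tpl_stats->recrf_rate = rate_cost;  // stored unscaled, in plain bits
  // ... later, where the rate is consumed as a cost:
  int64_t delta_rate =
      (tpl_stats->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate;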
@@ -1200,6 +1218,10 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
tpl_reset_src_ref_frames(tpl_data);
av1_tile_init(&xd->tile, cm, 0, 0);
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
// Setup scaling factor
av1_setup_scale_factors_for_frame(
&tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height,
@@ -1260,9 +1282,15 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
xd->block_ref_scale_factors[0] = &tpl_data->sf;
xd->block_ref_scale_factors[1] = &tpl_data->sf;
- const int base_qindex = pframe_qindex;
+ const int base_qindex =
+ cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex;
// Get rd multiplier set up.
- rdmult = (int)av1_compute_rd_mult(cpi, base_qindex);
+ rdmult = (int)av1_compute_rd_mult(
+ base_qindex, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
if (rdmult < 1) rdmult = 1;
av1_set_error_per_bit(&x->errorperbit, rdmult);
av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex);
@@ -1276,9 +1304,12 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
const FRAME_UPDATE_TYPE update_type =
gf_group->update_type[cpi->gf_frame_index];
tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(
- bd_info.bit_depth, update_type, pframe_qindex) /
+ bd_info.bit_depth, update_type, base_qindex) /
6;
+ if (cpi->use_ducky_encode)
+ tpl_frame->base_rdmult = gf_group->rdmult_val[frame_idx];
+
av1_init_tpl_txfm_stats(tpl_txfm_stats);
// Initialize x->mbmi_ext when compound predictions are enabled.
@@ -1593,7 +1624,8 @@ int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) {
return 0;
}
if (gf_frame_index >= MAX_TPL_FRAME_IDX) {
- assert(gf_frame_index < MAX_TPL_FRAME_IDX && "Invalid gf_frame_index\n");
+ // The sub-GOP length exceeds the TPL buffer capacity.
+ // Hence the TPL related functions are disabled hereafter.
return 0;
}
return tpl_data->tpl_frame[gf_frame_index].is_valid;
@@ -1626,6 +1658,8 @@ void av1_tpl_preload_rc_estimate(AV1_COMP *cpi,
AV1_COMMON *cm = &cpi->common;
GF_GROUP *gf_group = &cpi->ppi->gf_group;
int bottom_index, top_index;
+ if (cpi->use_ducky_encode) return;
+
cm->current_frame.frame_type = frame_params->frame_type;
for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
++gf_index) {
@@ -1842,6 +1876,10 @@ void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
cpi->gf_frame_index < cpi->ppi->gf_group.size));
const int tpl_idx = cpi->gf_frame_index;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx];
if (!tpl_frame->is_valid) return;
@@ -1877,11 +1915,24 @@ void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
}
const CommonQuantParams *quant_params = &cm->quant_params;
+
+ const int orig_qindex_rdmult =
+ quant_params->base_qindex + quant_params->y_dc_delta_q;
const int orig_rdmult = av1_compute_rd_mult(
- cpi, quant_params->base_qindex + quant_params->y_dc_delta_q);
- const int new_rdmult =
- av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
- quant_params->y_dc_delta_q);
+ orig_qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const int new_qindex_rdmult = quant_params->base_qindex +
+ x->rdmult_delta_qindex +
+ quant_params->y_dc_delta_q;
+ const int new_rdmult = av1_compute_rd_mult(
+ new_qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
double scale_adj = log(scaling_factor) - log_sum / base_block_count;
@@ -2027,9 +2078,16 @@ int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth);
const double target_qstep = leaf_qstep * qstep_ratio;
int qindex = leaf_qindex;
- for (qindex = leaf_qindex; qindex > 0; --qindex) {
- const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
- if (qstep <= target_qstep) break;
+ if (qstep_ratio < 1.0) {
+ for (qindex = leaf_qindex; qindex > 0; --qindex) {
+ const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (qstep <= target_qstep) break;
+ }
+ } else {
+ for (qindex = leaf_qindex; qindex <= MAXQ; ++qindex) {
+ const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (qstep >= target_qstep) break;
+ }
}
return qindex;
}
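The loop above previously only walked qindex downward, which cannot satisfy a target qstep larger than the leaf qstep. As an illustration with hypothetical quantizer values: if the leaf qstep is 16.0 and qstep_ratio is 2.0, the target qstep is 32.0; since 16.0 <= 32.0 already holds, the old downward loop broke out immediately and returned leaf_qindex unchanged, whereas the new upward branch increments qindex (capped at MAXQ) until av1_dc_quant_QTX() reaches 32.0 or more.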
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index b77a19ff7..71cc32059 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -104,20 +104,24 @@ typedef struct TplTxfmStats {
} TplTxfmStats;
typedef struct TplDepStats {
- int64_t intra_cost;
- int64_t inter_cost;
+ int64_t srcrf_sse;
int64_t srcrf_dist;
+ int64_t recrf_sse;
int64_t recrf_dist;
+ int64_t intra_sse;
+ int64_t intra_dist;
int64_t cmp_recrf_dist[2];
- int64_t srcrf_rate;
- int64_t recrf_rate;
- int64_t srcrf_sse;
- int64_t cmp_recrf_rate[2];
int64_t mc_dep_rate;
int64_t mc_dep_dist;
- int_mv mv[INTER_REFS_PER_FRAME];
- int ref_frame_index[2];
int64_t pred_error[INTER_REFS_PER_FRAME];
+ int32_t intra_cost;
+ int32_t inter_cost;
+ int32_t srcrf_rate;
+ int32_t recrf_rate;
+ int32_t intra_rate;
+ int32_t cmp_recrf_rate[2];
+ int_mv mv[INTER_REFS_PER_FRAME];
+ int8_t ref_frame_index[2];
} TplDepStats;
typedef struct TplDepFrame {
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index ea0288b0a..74c9de2ae 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -3473,7 +3473,7 @@ static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi,
assert(level >= 0 && level <= 2);
int model_rate;
int64_t model_dist;
- int model_skip;
+ uint8_t model_skip;
MACROBLOCKD *const xd = &x->e_mbd;
model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL,
diff --git a/av1/encoder/tx_search.h b/av1/encoder/tx_search.h
index e3caf5bf4..b3689cf7d 100644
--- a/av1/encoder/tx_search.h
+++ b/av1/encoder/tx_search.h
@@ -89,7 +89,7 @@ int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
* \param[in] rd_stats Pointer to struct to keep track of the RD stats
* \param[in] bsize Current macroblock size
* \param[in] ref_best_rd Best RD cost seen for this block so far
- * \return Nothing is returned. The selected transform size and type will
+ * \remark Nothing is returned. The selected transform size and type will
be saved in the MB_MODE_INFO structure
*/
void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -111,7 +111,7 @@ void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
* \param[in] rd_stats Pointer to struct to keep track of the RD stats
* \param[in] bs Current macroblock size
* \param[in] ref_best_rd Best RD cost seen for this block so far
- * \return Nothing is returned. The selected transform size and type will
+ * \remark Nothing is returned. The selected transform size and type will
be saved in the MB_MODE_INFO structure
*/
void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -158,7 +158,7 @@ int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
* \param[in] skip_trellis Binary flag indicating if trellis optimization
should be skipped
*
- * \return Nothing is returned. The RD results will be saved in rd_stats.
+ * \remark Nothing is returned. The RD results will be saved in rd_stats.
*/
void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
RD_STATS *rd_stats, int64_t ref_best_rd,
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index ab27d1f88..a953a6fb4 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -383,14 +383,16 @@ static AOM_INLINE void fill_variance_4x4avg(const uint8_t *s, int sp,
int highbd_flag,
#endif
int pixels_wide, int pixels_high,
- int is_key_frame) {
+ int is_key_frame,
+ int border_offset_4x4) {
int k;
for (k = 0; k < 4; k++) {
int x4_idx = x8_idx + ((k & 1) << 2);
int y4_idx = y8_idx + ((k >> 1) << 2);
unsigned int sse = 0;
int sum = 0;
- if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+ if (x4_idx < pixels_wide - border_offset_4x4 &&
+ y4_idx < pixels_high - border_offset_4x4) {
int s_avg;
int d_avg = 128;
#if CONFIG_AV1_HIGHBITDEPTH
@@ -429,8 +431,7 @@ static int64_t scale_part_thresh_content(int64_t threshold_base, int speed,
}
static AOM_INLINE void tune_thresh_based_on_qindex_window(
- int qindex, int th, int source_sad, int ag_idx, int64_t thresholds[]) {
- const int win = 45;
+ int qindex, int th, int win, int fac, int64_t thresholds[]) {
double weight;
if (qindex < th - win)
@@ -443,7 +444,6 @@ static AOM_INLINE void tune_thresh_based_on_qindex_window(
(int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]);
thresholds[2] =
(int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]);
- const int fac = (!ag_idx && source_sad != kLowSad) ? 1 : 2;
thresholds[3] =
(int)((1 - weight) * (thresholds[3] << fac) + weight * thresholds[3]);
}
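The rewritten window helper above takes win and fac from the caller instead of deriving fac from ag_idx and source_sad internally. As a quick numeric check of the interpolation it applies (hypothetical values): with weight = 0.5, fac = 2, and thresholds[3] = 1000, the update computes 0.5 * (1000 << 2) + 0.5 * 1000 = 2500, i.e. halfway between the boosted and the unmodified threshold.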
@@ -451,7 +451,9 @@ static AOM_INLINE void tune_thresh_based_on_qindex_window(
static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
int q, int content_lowsumdiff,
int source_sad_nonrd,
- int source_sad_rd, int segment_id) {
+ int source_sad_rd, int segment_id,
+ uint64_t blk_sad,
+ int lighting_change) {
AV1_COMMON *const cm = &cpi->common;
const int is_key_frame = frame_is_intra_only(cm);
const int threshold_multiplier = is_key_frame ? 120 : 1;
@@ -510,12 +512,12 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
else
threshold_base =
scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width,
- cm->height, cpi->svc.non_reference_frame);
+ cm->height, cpi->ppi->rtc_ref.non_reference_frame);
#else
// Increase base variance threshold based on content_state/sum_diff level.
- threshold_base =
- scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width,
- cm->height, cpi->svc.non_reference_frame);
+ threshold_base = scale_part_thresh_content(
+ threshold_base, cpi->oxcf.speed, cm->width, cm->height,
+ cpi->ppi->rtc_ref.non_reference_frame);
#endif
thresholds[0] = threshold_base >> 1;
thresholds[1] = threshold_base;
@@ -565,11 +567,13 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
thresholds[2] = (5 * threshold_base) >> 2;
} else if (cm->width < 1920 && cm->height < 1080) {
thresholds[2] = threshold_base << 1;
- } else {
+ } else if (cm->width < 2560 && cm->height < 1440) {
thresholds[2] = (5 * threshold_base) >> 1;
+ } else {
+ thresholds[2] = (7 * threshold_base) >> 1;
}
// Tune thresholds less or more aggressively to prefer larger partitions
- if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 4) {
+ if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3) {
double weight;
const int win = 20;
if (current_qindex < QINDEX_LARGE_BLOCK_THR - win)
@@ -585,14 +589,23 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
}
}
if (cm->width * cm->height <= 352 * 288) {
- thresholds[3] = INT32_MAX;
+ thresholds[3] = INT64_MAX;
if (segment_id == 0) {
thresholds[1] <<= 2;
- thresholds[2] <<= (source_sad_nonrd == kLowSad) ? 5 : 4;
+ thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4;
} else {
thresholds[1] <<= 1;
thresholds[2] <<= 3;
}
+ // Allow splitting down to 8x8 for superblocks that contain a moving
+ // boundary: require the superblock source_sad to be above a threshold
+ // but not very large, and the overall frame motion to be low, to avoid
+ // too many 8x8 blocks within the superblock.
+ if (segment_id == 0 && cpi->rc.avg_source_sad < 25000 &&
+ blk_sad > 25000 && blk_sad < 50000 && !lighting_change) {
+ thresholds[2] = (3 * thresholds[2]) >> 2;
+ thresholds[3] = thresholds[2] << 3;
+ }
// Condition the increase of partition thresholds on the segment
// and the content. Avoid the increase for superblocks which have
// high source sad, unless the whole frame has very high motion
@@ -602,7 +615,7 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
(source_sad_nonrd != kHighSad ||
cpi->rc.avg_source_sad > 50000)) {
thresholds[0] = (3 * thresholds[0]) >> 1;
- thresholds[3] = INT32_MAX;
+ thresholds[3] = INT64_MAX;
if (current_qindex > QINDEX_LARGE_BLOCK_THR) {
thresholds[1] =
(int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]);
@@ -616,16 +629,16 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
(int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]);
thresholds[2] =
(int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]);
- thresholds[3] = INT32_MAX;
+ thresholds[3] = INT64_MAX;
}
} else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 2) {
- tune_thresh_based_on_qindex_window(
- current_qindex, QINDEX_LARGE_BLOCK_THR, source_sad_nonrd,
- cpi->sf.rt_sf.prefer_large_partition_blocks - 2, thresholds);
+ thresholds[1] <<= (source_sad_nonrd <= kLowSad) ? 2 : 0;
+ thresholds[2] =
+ (source_sad_nonrd <= kLowSad) ? (3 * thresholds[2]) : thresholds[2];
} else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 1) {
- thresholds[3] <<= 2;
- thresholds[1] <<= (source_sad_nonrd == kLowSad) ? 1 : 0;
- thresholds[2] <<= (source_sad_nonrd == kLowSad) ? 1 : 0;
+ const int fac = (source_sad_nonrd <= kLowSad) ? 2 : 1;
+ tune_thresh_based_on_qindex_window(current_qindex, QINDEX_LARGE_BLOCK_THR,
+ 45, fac, thresholds);
}
if (cpi->sf.part_sf.disable_8x8_part_based_on_qidx && (current_qindex < 128))
thresholds[3] = INT64_MAX;
@@ -916,7 +929,7 @@ void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
return;
} else {
set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0,
- 0, 0);
+ 0, 0, 0, 0);
// The threshold below is not changed locally.
cpi->vbp_info.threshold_minmax = 15 + (q >> 3);
}
@@ -1010,13 +1023,23 @@ static void fill_variance_tree_leaves(
const int compute_minmax_variance = 0;
const int segment_id = xd->mi[0]->segment_id;
int pixels_wide = 128, pixels_high = 128;
-
+ int border_offset_4x4 = 0;
+ int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf;
if (is_small_sb) {
pixels_wide = 64;
pixels_high = 64;
}
if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ temporal_denoising |= cpi->oxcf.noise_sensitivity;
+#endif
+ // When temporal filtering or the temporal denoiser is enabled: since the
+ // source is modified, avoid the 4x4 avg along the superblock boundary;
+ // the SIMD code loads 8 pixels for the 4x4 avg and so can access source
+ // data outside the superblock (while it is being modified by the temporal
+ // filter). Temporal filtering is never done on key frames.
+ if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4;
for (int m = 0; m < num_64x64_blocks; m++) {
const int x64_idx = ((m & 1) << 6);
const int y64_idx = ((m >> 1) << 6);
@@ -1096,12 +1119,12 @@ static void fill_variance_tree_leaves(
int x8_idx = x16_idx + ((k & 1) << 3);
int y8_idx = y16_idx + ((k >> 1) << 3);
VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
- fill_variance_4x4avg(src, src_stride, dst, dst_stride, x8_idx,
- y8_idx, vst2,
+ fill_variance_4x4avg(
+ src, src_stride, dst, dst_stride, x8_idx, y8_idx, vst2,
#if CONFIG_AV1_HIGHBITDEPTH
- xd->cur_buf->flags,
+ xd->cur_buf->flags,
#endif
- pixels_wide, pixels_high, is_key_frame);
+ pixels_wide, pixels_high, is_key_frame, border_offset_4x4);
}
}
}
@@ -1110,7 +1133,8 @@ static void fill_variance_tree_leaves(
}
static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
- unsigned int *y_sad_g, unsigned int *y_sad_last,
+ unsigned int *y_sad_g, unsigned int *y_sad_alt,
+ unsigned int *y_sad_last,
MV_REFERENCE_FRAME *ref_frame_partition, int mi_row,
int mi_col) {
AV1_COMMON *const cm = &cpi->common;
@@ -1118,17 +1142,24 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
const int num_planes = av1_num_planes(cm);
const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
- // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
- // is!!
MB_MODE_INFO *mi = xd->mi[0];
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
assert(yv12 != NULL);
const YV12_BUFFER_CONFIG *yv12_g = NULL;
-
- // For non-SVC GOLDEN is another temporal reference. Check if it should be
- // used as reference for partitioning.
- if (!cpi->ppi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
- x->content_state_sb.source_sad_nonrd != kZeroSad) {
+ const YV12_BUFFER_CONFIG *yv12_alt = NULL;
+ // Check if LAST is a reference. For spatial layers always use it, since
+ // reference scaling (golden or altref being lower resolution) is not
+ // handled/checked here.
+ int use_last_ref = (cpi->ref_frame_flags & AOM_LAST_FLAG) ||
+ cpi->svc.number_spatial_layers > 1;
+ int use_golden_ref = cpi->ref_frame_flags & AOM_GOLD_FLAG;
+ int use_alt_ref = cpi->ppi->rtc_ref.set_ref_frame_config ||
+ cpi->sf.rt_sf.use_nonrd_altref_frame;
+
+ // For 1 spatial layer: GOLDEN is another temporal reference.
+ // Check if it should be used as reference for partitioning.
+ if (cpi->svc.number_spatial_layers == 1 && use_golden_ref &&
+ (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
if (yv12_g && yv12_g != yv12) {
av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -1139,30 +1170,47 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
}
}
- av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
- get_ref_scale_factors(cm, LAST_FRAME), num_planes);
- mi->ref_frame[0] = LAST_FRAME;
- mi->ref_frame[1] = NONE_FRAME;
- mi->bsize = cm->seq_params->sb_size;
- mi->mv[0].as_int = 0;
- mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
- if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
- if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
- const MV dummy_mv = { 0, 0 };
- *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size,
- mi_row, mi_col, &dummy_mv);
+ // For 1 spatial layer: ALTREF is another temporal reference.
+ // Check if it should be used as reference for partitioning.
+ if (cpi->svc.number_spatial_layers == 1 && use_alt_ref &&
+ (cpi->ref_frame_flags & AOM_ALT_FLAG) &&
+ (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
+ yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ if (yv12_alt && yv12_alt != yv12) {
+ av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col,
+ get_ref_scale_factors(cm, ALTREF_FRAME), num_planes);
+ *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
}
}
- if (*y_sad == UINT_MAX) {
- *y_sad = cpi->ppi->fn_ptr[bsize].sdf(
- x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
- xd->plane[0].pre[0].stride);
+
+ if (use_last_ref) {
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, LAST_FRAME), num_planes);
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+ mi->bsize = cm->seq_params->sb_size;
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
+ if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+ const MV dummy_mv = { 0, 0 };
+ *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size,
+ mi_row, mi_col, &dummy_mv);
+ }
+ }
+ if (*y_sad == UINT_MAX) {
+ *y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ }
+ *y_sad_last = *y_sad;
}
- *y_sad_last = *y_sad;
- // Pick the ref frame for partitioning, use golden frame only if its
- // lower sad.
- if (*y_sad_g < 0.9 * *y_sad) {
+ // Pick the ref frame for partitioning; use the golden or altref frame only
+ // if its SAD is lower, with a 0.9 bias toward LAST.
+ if (*y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt) {
av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
mi->ref_frame[0] = GOLDEN_FRAME;
@@ -1170,6 +1218,14 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
*y_sad = *y_sad_g;
*ref_frame_partition = GOLDEN_FRAME;
x->nonrd_prune_ref_frame_search = 0;
+ } else if (*y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g) {
+ av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col,
+ get_ref_scale_factors(cm, ALTREF_FRAME), num_planes);
+ mi->ref_frame[0] = ALTREF_FRAME;
+ mi->mv[0].as_int = 0;
+ *y_sad = *y_sad_alt;
+ *ref_frame_partition = ALTREF_FRAME;
+ x->nonrd_prune_ref_frame_search = 0;
} else {
*ref_frame_partition = LAST_FRAME;
x->nonrd_prune_ref_frame_search =
@@ -1181,7 +1237,7 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
cm->seq_params->sb_size, AOM_PLANE_Y,
- AOM_PLANE_V);
+ num_planes - 1);
}
}
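With ALTREF added as a partitioning reference candidate above, the selection rule after the SAD measurements reduces to: stay on LAST unless GOLDEN or ALTREF beats it by roughly 10% and also beats the other candidate. Restated compactly (a condensed view of the branches above, not new logic):

  // y_sad: LAST; y_sad_g: GOLDEN; y_sad_alt: ALTREF (UINT_MAX when unused).
  if (*y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt)
    *ref_frame_partition = GOLDEN_FRAME;
  else if (*y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g)
    *ref_frame_partition = ALTREF_FRAME;
  else
    *ref_frame_partition = LAST_FRAME;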
@@ -1205,6 +1261,18 @@ static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
: PART_EVAL_ONLY_NONE;
}
+static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad(
+ int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) {
+ if (set_zeromv_skip_based_on_source_sad == 0) return false;
+
+ if (set_zeromv_skip_based_on_source_sad >= 2)
+ return source_sad_nonrd <= kVeryLowSad;
+ else if (set_zeromv_skip_based_on_source_sad >= 1)
+ return source_sad_nonrd == kZeroSad;
+
+ return false;
+}
+
int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
ThreadData *td, MACROBLOCK *x, int mi_row,
int mi_col) {
@@ -1250,6 +1318,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
unsigned int y_sad = UINT_MAX;
unsigned int y_sad_g = UINT_MAX;
+ unsigned int y_sad_alt = UINT_MAX;
unsigned int y_sad_last = UINT_MAX;
BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
@@ -1267,6 +1336,17 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
const int low_res = (cm->width <= 352 && cm->height <= 288);
int variance4x4downsample[64];
const int segment_id = xd->mi[0]->segment_id;
+ uint64_t blk_sad = 0;
+ if (cpi->src_sad_blk_64x64 != NULL && !cpi->ppi->use_svc) {
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols =
+ (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sbi_col = mi_col / sb_size_by_mb;
+ const int sbi_row = mi_row / sb_size_by_mb;
+ blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+ }
if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
cyclic_refresh_segment_id_boosted(segment_id)) {
@@ -1274,12 +1354,14 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff,
x->content_state_sb.source_sad_nonrd,
- x->content_state_sb.source_sad_rd, 1);
+ x->content_state_sb.source_sad_rd, 1, blk_sad,
+ x->content_state_sb.lighting_change);
} else {
set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
x->content_state_sb.low_sumdiff,
x->content_state_sb.source_sad_nonrd,
- x->content_state_sb.source_sad_rd, 0);
+ x->content_state_sb.source_sad_rd, 0, blk_sad,
+ x->content_state_sb.lighting_change);
}
// For non keyframes, disable 4x4 average for low resolution when speed = 8
@@ -1310,8 +1392,8 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
}
if (!is_key_frame) {
- setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_last, &ref_frame_partition,
- mi_row, mi_col);
+ setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
+ &ref_frame_partition, mi_row, mi_col);
MB_MODE_INFO *mi = xd->mi[0];
// Use reference SB directly for zero mv.
@@ -1333,32 +1415,40 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, is_key_frame, zero_motion,
uv_sad);
- x->force_zeromv_skip = 0;
- const unsigned int thresh_exit_part =
- (cm->seq_params->sb_size == BLOCK_64X64) ? 5000 : 10000;
+ x->force_zeromv_skip_for_sb = 0;
+ const bool is_set_force_zeromv_skip =
+ is_set_force_zeromv_skip_based_on_src_sad(
+ cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad,
+ x->content_state_sb.source_sad_nonrd);
+
// If the superblock is completely static (zero source sad) and
// the y_sad (relative to LAST ref) is very small, take the sb_size partition
// and exit, and force zeromv_last skip mode for nonrd_pickmode.
- // Only do this when the cyclic refresh is applied, and only on the base
- // segment (so the QP-boosted segment can still contnue cleaning/ramping
- // up the quality). Condition on color uv_sad is also added.
+ // Only do this on the base segment (so the QP-boosted segment, if applied,
+ // can still continue cleaning/ramping up the quality).
+ // Condition on color uv_sad is also added.
if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv &&
- cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
- cpi->cyclic_refresh->apply_cyclic_refresh &&
- segment_id == CR_SEGMENT_ID_BASE &&
- x->content_state_sb.source_sad_nonrd == kZeroSad &&
- ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0 &&
- y_sad < thresh_exit_part && uv_sad[0]<(3 * thresh_exit_part)>> 2 &&
- uv_sad[1]<(3 * thresh_exit_part)>> 2) {
+ cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE &&
+ is_set_force_zeromv_skip && ref_frame_partition == LAST_FRAME &&
+ xd->mi[0]->mv[0].as_int == 0) {
const int block_width = mi_size_wide[cm->seq_params->sb_size];
const int block_height = mi_size_high[cm->seq_params->sb_size];
+ const unsigned int thresh_exit_part_y =
+ cpi->zeromv_skip_thresh_exit_part[bsize];
+ const unsigned int thresh_exit_part_uv =
+ CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y);
if (mi_col + block_width <= tile->mi_col_end &&
- mi_row + block_height <= tile->mi_row_end) {
+ mi_row + block_height <= tile->mi_row_end &&
+ y_sad < thresh_exit_part_y && uv_sad[0] < thresh_exit_part_uv &&
+ uv_sad[1] < thresh_exit_part_uv) {
set_block_size(cpi, mi_row, mi_col, bsize);
- x->force_zeromv_skip = 1;
+ x->force_zeromv_skip_for_sb = 1;
if (vt2) aom_free(vt2);
if (vt) aom_free(vt);
return 0;
+ } else if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ cpi->sf.rt_sf.part_early_exit_zeromv >= 2) {
+ x->force_zeromv_skip_for_sb = 2;
}
}
@@ -1407,6 +1497,10 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
// (some threshold of) the average variance over the sub-16x16 blocks,
// then force this block to split. This also forces a split on the upper
// (64x64) level.
+ uint64_t frame_sad_thresh = 20000;
+ if (cpi->svc.number_temporal_layers > 2 &&
+ cpi->svc.temporal_layer_id == 0)
+ frame_sad_thresh = frame_sad_thresh << 1;
if (force_split[5 + m2 + i] == PART_EVAL_ALL) {
get_variance(&vt->split[m].split[i].part_variances.none);
var_32x32 = vt->split[m].split[i].part_variances.none.variance;
@@ -1428,7 +1522,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
maxvar_16x16[m][i] > thresholds[2]) ||
(cpi->sf.rt_sf.prefer_large_partition_blocks &&
x->content_state_sb.source_sad_nonrd > kLowSad &&
- cpi->rc.frame_source_sad < 20000 &&
+ cpi->rc.frame_source_sad < frame_sad_thresh &&
maxvar_16x16[m][i] > (thresholds[2] >> 4) &&
maxvar_16x16[m][i] > (minvar_16x16[m][i] << 2)))) {
force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
diff --git a/av1/encoder/var_based_part.h b/av1/encoder/var_based_part.h
index 517675134..7febc0eb7 100644
--- a/av1/encoder/var_based_part.h
+++ b/av1/encoder/var_based_part.h
@@ -28,6 +28,8 @@ extern "C" {
100 // Use increased thresholds for midres for speed 9 when qindex is above
// this threshold
+#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
+ ((3 * (thresh_exit_part)) >> 2)
/*!\brief Set the thresholds for variance based partition.
*
* Set the variance split thresholds for following the block sizes:
@@ -44,7 +46,7 @@ extern "C" {
* \param[in] q q index
* \param[in] content_lowsumdiff Low sumdiff flag for superblock
*
- * \return Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ * \remark Returns the set of thresholds in \c cpi->vbp_info.thresholds.
*/
void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
int content_lowsumdiff);
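For context, the new zero-MV early exit compares the superblock's luma SAD against a per-block-size threshold and both chroma SADs against the value derived by CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP. A minimal scalar sketch of that gate follows; the helper names and the standalone form are hypothetical, not part of the patch:

    #include <stdbool.h>

    /* 3/4 of the luma threshold, mirroring CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP. */
    static unsigned int chroma_thresh_for_zeromv_skip(unsigned int thresh_y) {
      return (3 * thresh_y) >> 2;
    }

    /* Hypothetical gate: keep the full SB partition and force zero-MV skip only
     * when the luma SAD and both chroma SADs fall below their thresholds. */
    static bool zeromv_skip_gate(unsigned int y_sad, const unsigned int uv_sad[2],
                                 unsigned int thresh_exit_part_y) {
      const unsigned int thresh_uv =
          chroma_thresh_for_zeromv_skip(thresh_exit_part_y);
      return y_sad < thresh_exit_part_y && uv_sad[0] < thresh_uv &&
             uv_sad[1] < thresh_uv;
    }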
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index fa5c66abf..b898fc60d 100644
--- a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1574,6 +1574,332 @@ static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
fadst16x16_new_avx2 // H_FLIPADST
};
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride,
+ __m128i *out, int bit) {
+ out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ out[0] = _mm_slli_epi16(out[0], bit);
+ out[1] = _mm_slli_epi16(out[1], bit);
+ out[2] = _mm_slli_epi16(out[2], bit);
+ out[3] = _mm_slli_epi16(out[3], bit);
+ out[4] = _mm_slli_epi16(out[4], bit);
+ out[5] = _mm_slli_epi16(out[5], bit);
+ out[6] = _mm_slli_epi16(out[6], bit);
+ out[7] = _mm_slli_epi16(out[7], bit);
+}
+
+static INLINE void load_buffer_and_flip_round_shift(const int16_t *in,
+ int stride, __m128i *out,
+ int bit) {
+ out[7] = load_16bit_to_16bit(in + 0 * stride);
+ out[6] = load_16bit_to_16bit(in + 1 * stride);
+ out[5] = load_16bit_to_16bit(in + 2 * stride);
+ out[4] = load_16bit_to_16bit(in + 3 * stride);
+ out[3] = load_16bit_to_16bit(in + 4 * stride);
+ out[2] = load_16bit_to_16bit(in + 5 * stride);
+ out[1] = load_16bit_to_16bit(in + 6 * stride);
+ out[0] = load_16bit_to_16bit(in + 7 * stride);
+ out[7] = _mm_slli_epi16(out[7], bit);
+ out[6] = _mm_slli_epi16(out[6], bit);
+ out[5] = _mm_slli_epi16(out[5], bit);
+ out[4] = _mm_slli_epi16(out[4], bit);
+ out[3] = _mm_slli_epi16(out[3], bit);
+ out[2] = _mm_slli_epi16(out[2], bit);
+ out[1] = _mm_slli_epi16(out[1], bit);
+ out[0] = _mm_slli_epi16(out[0], bit);
+}
+
+#define TRANSPOSE_8X8_AVX2() \
+ { \
+ /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/ \
+ /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \
+ /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \
+ /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \
+ const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \
+ const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \
+ const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \
+ const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \
+ /* Unpack 32 bit elements resulting in: */ \
+ /* bb0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \
+ /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \
+ /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \
+    /* bb3: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/               \
+ const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \
+ const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \
+ const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \
+ const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \
+ /* bb0: 00 10 20 30 40 50 60 70| 01 11 21 31 41 51 61 71*/ \
+ /* bb1: 02 12 22 32 42 52 62 72| 03 13 23 33 43 53 63 73*/ \
+ /* bb2: 04 14 24 34 44 54 64 74| 05 15 25 35 45 55 65 75*/ \
+    /* bb3: 06 16 26 36 46 56 66 76| 07 17 27 37 47 57 67 77*/                \
+ c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \
+ c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \
+ c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \
+ c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \
+ }
+
+static INLINE void transpose_round_shift_flip_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+
+ // Unpack 64 bit elements resulting in:
+ // out[7]: 00 10 20 30 40 50 60 70
+ // out[6]: 01 11 21 31 41 51 61 71
+ // out[5]: 02 12 22 32 42 52 62 72
+ // out[4]: 03 13 23 33 43 53 63 73
+ // out[3]: 04 14 24 34 44 54 64 74
+ // out[2]: 05 15 25 35 45 55 65 75
+ // out[1]: 06 16 26 36 46 56 66 76
+ // out[0]: 07 17 27 37 47 57 67 77
+ out[7] = _mm256_castsi256_si128(c0);
+ out[6] = _mm256_extractf128_si256(c0, 1);
+ out[5] = _mm256_castsi256_si128(c1);
+ out[4] = _mm256_extractf128_si256(c1, 1);
+ out[3] = _mm256_castsi256_si128(c2);
+ out[2] = _mm256_extractf128_si256(c2, 1);
+ out[1] = _mm256_castsi256_si128(c3);
+ out[0] = _mm256_extractf128_si256(c3, 1);
+}
+
+static INLINE void transpose_round_shift_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+ // Unpack 64 bit elements resulting in:
+ // out[7]: 00 10 20 30 40 50 60 70
+ // out[6]: 01 11 21 31 41 51 61 71
+ // out[5]: 02 12 22 32 42 52 62 72
+ // out[4]: 03 13 23 33 43 53 63 73
+ // out[3]: 04 14 24 34 44 54 64 74
+ // out[2]: 05 15 25 35 45 55 65 75
+ // out[1]: 06 16 26 36 46 56 66 76
+ // out[0]: 07 17 27 37 47 57 67 77
+ out[0] = _mm256_castsi256_si128(c0);
+ out[1] = _mm256_extractf128_si256(c0, 1);
+ out[2] = _mm256_castsi256_si128(c1);
+ out[3] = _mm256_extractf128_si256(c1, 1);
+ out[4] = _mm256_castsi256_si128(c2);
+ out[5] = _mm256_extractf128_si256(c2, 1);
+ out[6] = _mm256_castsi256_si128(c3);
+ out[7] = _mm256_extractf128_si256(c3, 1);
+}
+
+static INLINE void transpose_16bit_and_store_8x8(const __m128i *const in,
+ int32_t *output) {
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // s04: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // s15: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // s26: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // s37: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ // a0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53
+ // a1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57
+ // a2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73
+ // a3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77
+ const __m256i a0 = _mm256_unpacklo_epi16(s04, s15);
+ const __m256i a1 = _mm256_unpackhi_epi16(s04, s15);
+ const __m256i a2 = _mm256_unpacklo_epi16(s26, s37);
+ const __m256i a3 = _mm256_unpackhi_epi16(s26, s37);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71
+ // b1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75
+  // b3: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77
+ const __m256i b0 = _mm256_unpacklo_epi32(a0, a2);
+ const __m256i b1 = _mm256_unpackhi_epi32(a0, a2);
+ const __m256i b2 = _mm256_unpacklo_epi32(a1, a3);
+ const __m256i b3 = _mm256_unpackhi_epi32(a1, a3);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ const __m256i a_lo = _mm256_unpacklo_epi16(b0, b0);
+ const __m256i a_hi = _mm256_unpackhi_epi16(b0, b0);
+ const __m256i b_lo = _mm256_unpacklo_epi16(b1, b1);
+ const __m256i b_hi = _mm256_unpackhi_epi16(b1, b1);
+ const __m256i c_lo = _mm256_unpacklo_epi16(b2, b2);
+ const __m256i c_hi = _mm256_unpackhi_epi16(b2, b2);
+ const __m256i d_lo = _mm256_unpacklo_epi16(b3, b3);
+ const __m256i d_hi = _mm256_unpackhi_epi16(b3, b3);
+
+ const __m256i a_1 = _mm256_srai_epi32(a_lo, 16);
+ const __m256i a_2 = _mm256_srai_epi32(a_hi, 16);
+ const __m256i a_3 = _mm256_srai_epi32(b_lo, 16);
+ const __m256i a_4 = _mm256_srai_epi32(b_hi, 16);
+ const __m256i a_5 = _mm256_srai_epi32(c_lo, 16);
+ const __m256i a_6 = _mm256_srai_epi32(c_hi, 16);
+ const __m256i a_7 = _mm256_srai_epi32(d_lo, 16);
+ const __m256i a_8 = _mm256_srai_epi32(d_hi, 16);
+
+ _mm256_store_si256((__m256i *)output, a_1);
+ _mm256_store_si256((__m256i *)(output + 8), a_2);
+ _mm256_store_si256((__m256i *)(output + 16), a_3);
+ _mm256_store_si256((__m256i *)(output + 24), a_4);
+ _mm256_store_si256((__m256i *)(output + 32), a_5);
+ _mm256_store_si256((__m256i *)(output + 40), a_6);
+ _mm256_store_si256((__m256i *)(output + 48), a_7);
+ _mm256_store_si256((__m256i *)(output + 56), a_8);
+}
+
+static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // The shift-bit check is skipped while round shifting, on the assumption
+  // that shift[0] is always positive.
+ assert(shift[0] > 0);
+ if (ud_flip)
+ load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
+ else
+ load_buffer_and_round_shift(input, stride, buf0, shift[0]);
+
+ col_txfm(buf0, buf0, cos_bit_col);
+  // The shift-bit check is skipped while round shifting, on the assumption
+  // that shift[1] is always negative.
+ assert(shift[1] < 0);
+
+ if (lr_flip) {
+ transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
+ } else {
+ transpose_round_shift_8x8(buf0, buf1, shift[1]);
+ }
+
+ buf = buf1;
+ row_txfm(buf, buf, cos_bit_row);
+
+  // The round-and-shift operation is skipped here since the shift bit is
+  // assumed to always be zero.
+ assert(shift[2] == 0);
+ transpose_16bit_and_store_8x8(buf, output);
+}
+
static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
@@ -2781,7 +3107,7 @@ static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
- av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform
lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
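As a rough scalar reference for the scaling in the new 8x8 AVX2 path, under the usual AV1 convention that a positive shift value means an up-shift and a negative one a rounded down-shift (a sketch, not code from the patch):

    #include <stdint.h>

    /* Rounded right shift by bit (bit > 0), the elementwise operation behind
     * transpose_round_shift_8x8() when called with -shift[1]. */
    static int16_t round_shift16(int32_t x, int bit) {
      return (int16_t)((x + (1 << (bit - 1))) >> bit);
    }

    /* Per-coefficient scaling sketch: up-shift on load (shift[0] > 0), rounded
     * down-shift after the column transform (shift[1] < 0), and no shift after
     * the row transform (shift[2] == 0). */
    static int16_t scale_coeff(int16_t in, int shift0, int shift1) {
      int32_t x = in << shift0;      /* load_buffer_and_round_shift() */
      /* ... column transform would run here ... */
      x = round_shift16(x, -shift1); /* transpose_round_shift_8x8() */
      /* ... row transform would run here ... */
      return (int16_t)x;
    }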
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index 694e6131c..748ef4d46 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -150,71 +150,6 @@ static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
output[7] = x4[7];
}
-static void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
- const int32_t *cospi = cospi_arr(cos_bit);
- const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
-
- __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
- __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
- __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
- __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
- __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
- __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
- __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
- __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
- __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
-
- // stage 1
- __m128i x1[8];
- x1[0] = _mm_adds_epi16(input[0], input[7]);
- x1[7] = _mm_subs_epi16(input[0], input[7]);
- x1[1] = _mm_adds_epi16(input[1], input[6]);
- x1[6] = _mm_subs_epi16(input[1], input[6]);
- x1[2] = _mm_adds_epi16(input[2], input[5]);
- x1[5] = _mm_subs_epi16(input[2], input[5]);
- x1[3] = _mm_adds_epi16(input[3], input[4]);
- x1[4] = _mm_subs_epi16(input[3], input[4]);
-
- // stage 2
- __m128i x2[8];
- x2[0] = _mm_adds_epi16(x1[0], x1[3]);
- x2[3] = _mm_subs_epi16(x1[0], x1[3]);
- x2[1] = _mm_adds_epi16(x1[1], x1[2]);
- x2[2] = _mm_subs_epi16(x1[1], x1[2]);
- x2[4] = x1[4];
- btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
- x2[7] = x1[7];
-
- // stage 3
- __m128i x3[8];
- btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
- btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
- x3[4] = _mm_adds_epi16(x2[4], x2[5]);
- x3[5] = _mm_subs_epi16(x2[4], x2[5]);
- x3[6] = _mm_subs_epi16(x2[7], x2[6]);
- x3[7] = _mm_adds_epi16(x2[7], x2[6]);
-
- // stage 4
- __m128i x4[8];
- x4[0] = x3[0];
- x4[1] = x3[1];
- x4[2] = x3[2];
- x4[3] = x3[3];
- btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]);
- btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]);
-
- // stage 5
- output[0] = x4[0];
- output[1] = x4[4];
- output[2] = x4[2];
- output[3] = x4[6];
- output[4] = x4[1];
- output[5] = x4[5];
- output[6] = x4[3];
- output[7] = x4[7];
-}
-
static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
@@ -1425,7 +1360,7 @@ static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
- const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __zero = _mm_setzero_si128();
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
const __m128i in7 = _mm_add_epi16(input[0], input[1]);
__m128i u[8], v[8];
@@ -1573,7 +1508,7 @@ static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
- const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __zero = _mm_setzero_si128();
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
const __m128i in7 = _mm_add_epi16(input[0], input[1]);
__m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8];
@@ -1643,95 +1578,6 @@ static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]);
}
-static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
- const int32_t *cospi = cospi_arr(cos_bit);
- const __m128i __zero = _mm_setzero_si128();
- const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
-
- __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
- __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
- __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
- __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
- __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
- __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
- __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
- __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
- __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
- __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
- __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
- __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
- __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
-
- // stage 1
- __m128i x1[8];
- x1[0] = input[0];
- x1[1] = _mm_subs_epi16(__zero, input[7]);
- x1[2] = _mm_subs_epi16(__zero, input[3]);
- x1[3] = input[4];
- x1[4] = _mm_subs_epi16(__zero, input[1]);
- x1[5] = input[6];
- x1[6] = input[2];
- x1[7] = _mm_subs_epi16(__zero, input[5]);
-
- // stage 2
- __m128i x2[8];
- x2[0] = x1[0];
- x2[1] = x1[1];
- btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
- x2[4] = x1[4];
- x2[5] = x1[5];
- btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
-
- // stage 3
- __m128i x3[8];
- x3[0] = _mm_adds_epi16(x2[0], x2[2]);
- x3[2] = _mm_subs_epi16(x2[0], x2[2]);
- x3[1] = _mm_adds_epi16(x2[1], x2[3]);
- x3[3] = _mm_subs_epi16(x2[1], x2[3]);
- x3[4] = _mm_adds_epi16(x2[4], x2[6]);
- x3[6] = _mm_subs_epi16(x2[4], x2[6]);
- x3[5] = _mm_adds_epi16(x2[5], x2[7]);
- x3[7] = _mm_subs_epi16(x2[5], x2[7]);
-
- // stage 4
- __m128i x4[8];
- x4[0] = x3[0];
- x4[1] = x3[1];
- x4[2] = x3[2];
- x4[3] = x3[3];
- btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
- btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
-
- // stage 5
- __m128i x5[8];
- x5[0] = _mm_adds_epi16(x4[0], x4[4]);
- x5[4] = _mm_subs_epi16(x4[0], x4[4]);
- x5[1] = _mm_adds_epi16(x4[1], x4[5]);
- x5[5] = _mm_subs_epi16(x4[1], x4[5]);
- x5[2] = _mm_adds_epi16(x4[2], x4[6]);
- x5[6] = _mm_subs_epi16(x4[2], x4[6]);
- x5[3] = _mm_adds_epi16(x4[3], x4[7]);
- x5[7] = _mm_subs_epi16(x4[3], x4[7]);
-
- // stage 6
- __m128i x6[8];
- btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]);
- btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]);
- btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]);
- btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]);
-
- // stage 7
- output[0] = x6[1];
- output[1] = x6[6];
- output[2] = x6[3];
- output[3] = x6[4];
- output[4] = x6[5];
- output[5] = x6[2];
- output[6] = x6[7];
- output[7] = x6[0];
-}
-
static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.h b/av1/encoder/x86/av1_fwd_txfm_sse2.h
index a0e32f538..3cb869a8f 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.h
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -71,6 +71,140 @@ static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
output[7] = _mm_adds_epi16(input[7], input[7]);
}
+static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4 and 5
+ output[0] = x3[0];
+ output[4] = x3[1];
+ output[2] = x3[2];
+ output[6] = x3[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], output[1], output[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]);
+}
+
+static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+ // stage 5, 6 and 7
+ output[7] = _mm_adds_epi16(x4[0], x4[4]);
+ output[3] = _mm_subs_epi16(x4[0], x4[4]);
+ output[0] = _mm_adds_epi16(x4[1], x4[5]);
+ output[4] = _mm_subs_epi16(x4[1], x4[5]);
+ output[5] = _mm_adds_epi16(x4[2], x4[6]);
+ output[1] = _mm_subs_epi16(x4[2], x4[6]);
+ output[2] = _mm_adds_epi16(x4[3], x4[7]);
+ output[6] = _mm_subs_epi16(x4[3], x4[7]);
+
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, output[7], output[0], output[7],
+ output[0]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, output[5], output[2], output[5],
+ output[2]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, output[3], output[4], output[3],
+ output[4]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, output[1], output[6], output[1],
+ output[6]);
+}
+
static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
int8_t cos_bit) {
(void)cos_bit;
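For readers following the moved fdct8x8/fadst8x8 kernels above, a plausible scalar reading of the btf_16_sse2 butterfly is two rounded, cos_bit-shifted dot products against the cosine pairs built with pair_set_epi16. This is an assumption about its semantics for illustration, not code taken from the patch:

    #include <stdint.h>

    /* Hypothetical scalar model of a 16-bit butterfly with weight pairs
     * (w0a, w0b) and (w1a, w1b): two cos_bit-rounded dot products of
     * (in0, in1). */
    static void btf16_scalar(int w0a, int w0b, int w1a, int w1b, int16_t in0,
                             int16_t in1, int16_t *out0, int16_t *out1,
                             int cos_bit) {
      const int rounding = 1 << (cos_bit - 1);
      *out0 = (int16_t)((w0a * in0 + w0b * in1 + rounding) >> cos_bit);
      *out1 = (int16_t)((w1a * in0 + w1b * in1 + rounding) >> cos_bit);
    }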
diff --git a/av1/encoder/x86/av1_k_means_avx2.c b/av1/encoder/x86/av1_k_means_avx2.c
index 23a7369e9..a2db22214 100644
--- a/av1/encoder/x86/av1_k_means_avx2.c
+++ b/av1/encoder/x86/av1_k_means_avx2.c
@@ -13,71 +13,86 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/synonyms.h"
-void av1_calc_indices_dim1_avx2(const int *data, const int *centroids,
- uint8_t *indices, int n, int k) {
+static int64_t k_means_horizontal_sum_avx2(__m256i a) {
+ const __m128i low = _mm256_castsi256_si128(a);
+ const __m128i high = _mm256_extracti128_si256(a, 1);
+ const __m128i sum = _mm_add_epi64(low, high);
+ const __m128i sum_high = _mm_unpackhi_epi64(sum, sum);
+ int64_t res;
+ _mm_storel_epi64((__m128i *)&res, _mm_add_epi64(sum, sum_high));
+ return res;
+}
+
+void av1_calc_indices_dim1_avx2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
__m256i dist[PALETTE_MAX_SIZE];
const __m256i v_zero = _mm256_setzero_si256();
+ __m256i sum = _mm256_setzero_si256();
- for (int i = 0; i < n; i += 8) {
+ for (int i = 0; i < n; i += 16) {
__m256i ind = _mm256_loadu_si256((__m256i *)data);
for (int j = 0; j < k; j++) {
- __m256i cent = _mm256_set1_epi32((uint32_t)centroids[j]);
- __m256i d1 = _mm256_sub_epi32(ind, cent);
- dist[j] = _mm256_mullo_epi32(d1, d1);
+ __m256i cent = _mm256_set1_epi16(centroids[j]);
+ __m256i d1 = _mm256_sub_epi16(ind, cent);
+ dist[j] = _mm256_abs_epi16(d1);
}
ind = _mm256_setzero_si256();
for (int j = 1; j < k; j++) {
- __m256i cmp = _mm256_cmpgt_epi32(dist[0], dist[j]);
- __m256i dist1 = _mm256_andnot_si256(cmp, dist[0]);
- __m256i dist2 = _mm256_and_si256(cmp, dist[j]);
- dist[0] = _mm256_or_si256(dist1, dist2);
- __m256i ind1 = _mm256_set1_epi32(j);
+ __m256i cmp = _mm256_cmpgt_epi16(dist[0], dist[j]);
+ dist[0] = _mm256_min_epi16(dist[0], dist[j]);
+ __m256i ind1 = _mm256_set1_epi16(j);
ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
_mm256_and_si256(cmp, ind1));
}
- __m256i p1 = _mm256_packus_epi32(ind, v_zero);
+ __m256i p1 = _mm256_packus_epi16(ind, v_zero);
__m256i px = _mm256_permute4x64_epi64(p1, 0x58);
- __m256i p2 = _mm256_packus_epi16(px, v_zero);
- __m128i d1 = _mm256_extracti128_si256(p2, 0);
+ __m128i d1 = _mm256_extracti128_si256(px, 0);
- _mm_storel_epi64((__m128i *)indices, d1);
+ _mm_storeu_si128((__m128i *)indices, d1);
- indices += 8;
- data += 8;
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ dist[0] = _mm256_madd_epi16(dist[0], dist[0]);
+ // Convert to 64 bit and add to sum.
+ const __m256i dist1 = _mm256_unpacklo_epi32(dist[0], v_zero);
+ const __m256i dist2 = _mm256_unpackhi_epi32(dist[0], v_zero);
+ sum = _mm256_add_epi64(sum, dist1);
+ sum = _mm256_add_epi64(sum, dist2);
+ }
+
+ indices += 16;
+ data += 16;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_avx2(sum);
}
}
-void av1_calc_indices_dim2_avx2(const int *data, const int *centroids,
- uint8_t *indices, int n, int k) {
+void av1_calc_indices_dim2_avx2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
__m256i dist[PALETTE_MAX_SIZE];
const __m256i v_zero = _mm256_setzero_si256();
- const __m256i v_permute = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
+ __m256i sum = _mm256_setzero_si256();
for (int i = 0; i < n; i += 8) {
- __m256i ind1 = _mm256_loadu_si256((__m256i *)data);
- __m256i ind2 = _mm256_loadu_si256((__m256i *)(data + 8));
+ __m256i ind = _mm256_loadu_si256((__m256i *)data);
for (int j = 0; j < k; j++) {
- __m128i cent0 = _mm_loadl_epi64((__m128i const *)&centroids[2 * j]);
- __m256i cent1 = _mm256_inserti128_si256(v_zero, cent0, 0);
- cent1 = _mm256_inserti128_si256(cent1, cent0, 1);
- __m256i cent = _mm256_unpacklo_epi64(cent1, cent1);
- __m256i d1 = _mm256_sub_epi32(ind1, cent);
- __m256i d2 = _mm256_sub_epi32(ind2, cent);
- __m256i d3 = _mm256_mullo_epi32(d1, d1);
- __m256i d4 = _mm256_mullo_epi32(d2, d2);
- __m256i d5 = _mm256_hadd_epi32(d3, d4);
- dist[j] = _mm256_permutevar8x32_epi32(d5, v_permute);
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ const __m256i cent = _mm256_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx, cy,
+ cx, cy, cx, cy, cx, cy, cx);
+ const __m256i d1 = _mm256_sub_epi16(ind, cent);
+ dist[j] = _mm256_madd_epi16(d1, d1);
}
- __m256i ind = _mm256_setzero_si256();
+ ind = _mm256_setzero_si256();
for (int j = 1; j < k; j++) {
__m256i cmp = _mm256_cmpgt_epi32(dist[0], dist[j]);
- __m256i dist1 = _mm256_andnot_si256(cmp, dist[0]);
- __m256i dist2 = _mm256_and_si256(cmp, dist[j]);
- dist[0] = _mm256_or_si256(dist1, dist2);
- ind1 = _mm256_set1_epi32(j);
+ dist[0] = _mm256_min_epi32(dist[0], dist[j]);
+ const __m256i ind1 = _mm256_set1_epi32(j);
ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
_mm256_and_si256(cmp, ind1));
}
@@ -89,7 +104,18 @@ void av1_calc_indices_dim2_avx2(const int *data, const int *centroids,
_mm_storel_epi64((__m128i *)indices, d1);
+ if (total_dist) {
+ // Convert to 64 bit and add to sum.
+ const __m256i dist1 = _mm256_unpacklo_epi32(dist[0], v_zero);
+ const __m256i dist2 = _mm256_unpackhi_epi32(dist[0], v_zero);
+ sum = _mm256_add_epi64(sum, dist1);
+ sum = _mm256_add_epi64(sum, dist2);
+ }
+
indices += 8;
data += 16;
}
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_avx2(sum);
+ }
}
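A minimal scalar sketch of what the reworked av1_calc_indices_dim1 kernels compute with the new int16_t signature: the nearest-centroid index per sample, plus an optional total squared distance. Reference code for illustration only; the function name is hypothetical:

    #include <stdint.h>
    #include <stdlib.h>

    static void calc_indices_dim1_scalar(const int16_t *data,
                                         const int16_t *centroids,
                                         uint8_t *indices, int64_t *total_dist,
                                         int n, int k) {
      int64_t sum = 0;
      for (int i = 0; i < n; ++i) {
        int best = 0;
        int best_dist = abs(data[i] - centroids[0]);
        for (int j = 1; j < k; ++j) {
          const int dist = abs(data[i] - centroids[j]);
          if (dist < best_dist) {
            best_dist = dist;
            best = j;
          }
        }
        indices[i] = (uint8_t)best;
        sum += (int64_t)best_dist * best_dist;  /* squared distance */
      }
      if (total_dist) *total_dist = sum;
    }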
diff --git a/av1/encoder/x86/av1_k_means_sse2.c b/av1/encoder/x86/av1_k_means_sse2.c
index 43f661fda..a284fa9bf 100644
--- a/av1/encoder/x86/av1_k_means_sse2.c
+++ b/av1/encoder/x86/av1_k_means_sse2.c
@@ -14,68 +14,73 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/synonyms.h"
-void av1_calc_indices_dim1_sse2(const int *data, const int *centroids,
- uint8_t *indices, int n, int k) {
+static int64_t k_means_horizontal_sum_sse2(__m128i a) {
+ const __m128i sum1 = _mm_unpackhi_epi64(a, a);
+ const __m128i sum2 = _mm_add_epi64(a, sum1);
+ int64_t res;
+ _mm_storel_epi64((__m128i *)&res, sum2);
+ return res;
+}
+
+void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
const __m128i v_zero = _mm_setzero_si128();
- int l = 1;
__m128i dist[PALETTE_MAX_SIZE];
- __m128i ind[2];
+ __m128i sum = _mm_setzero_si128();
- for (int i = 0; i < n; i += 4) {
- l = (l == 0) ? 1 : 0;
- ind[l] = _mm_loadu_si128((__m128i *)data);
+ for (int i = 0; i < n; i += 8) {
+ __m128i in = _mm_loadu_si128((__m128i *)data);
for (int j = 0; j < k; j++) {
- __m128i cent = _mm_set1_epi32((uint32_t)centroids[j]);
- __m128i d1 = _mm_sub_epi32(ind[l], cent);
- __m128i d2 = _mm_packs_epi32(d1, d1);
- __m128i d3 = _mm_mullo_epi16(d2, d2);
- __m128i d4 = _mm_mulhi_epi16(d2, d2);
- dist[j] = _mm_unpacklo_epi16(d3, d4);
+ __m128i cent = _mm_set1_epi16(centroids[j]);
+ __m128i d1 = _mm_sub_epi16(in, cent);
+ __m128i d2 = _mm_sub_epi16(cent, in);
+ dist[j] = _mm_max_epi16(d1, d2);
}
- ind[l] = _mm_setzero_si128();
+ __m128i ind = _mm_setzero_si128();
for (int j = 1; j < k; j++) {
- __m128i cmp = _mm_cmpgt_epi32(dist[0], dist[j]);
- __m128i dist1 = _mm_andnot_si128(cmp, dist[0]);
- __m128i dist2 = _mm_and_si128(cmp, dist[j]);
- dist[0] = _mm_or_si128(dist1, dist2);
- __m128i ind1 = _mm_set1_epi32(j);
- ind[l] =
- _mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
+ __m128i cmp = _mm_cmpgt_epi16(dist[0], dist[j]);
+ dist[0] = _mm_min_epi16(dist[0], dist[j]);
+ __m128i ind1 = _mm_set1_epi16(j);
+ ind = _mm_or_si128(_mm_andnot_si128(cmp, ind), _mm_and_si128(cmp, ind1));
}
- ind[l] = _mm_packus_epi16(ind[l], v_zero);
- if (l == 1) {
- __m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
- _mm_storel_epi64((__m128i *)indices, p2);
- indices += 8;
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ dist[0] = _mm_madd_epi16(dist[0], dist[0]);
+ // Convert to 64 bit and add to sum.
+ const __m128i dist1 = _mm_unpacklo_epi32(dist[0], v_zero);
+ const __m128i dist2 = _mm_unpackhi_epi32(dist[0], v_zero);
+ sum = _mm_add_epi64(sum, dist1);
+ sum = _mm_add_epi64(sum, dist2);
}
- data += 4;
+ __m128i p2 = _mm_packus_epi16(ind, v_zero);
+ _mm_storel_epi64((__m128i *)indices, p2);
+ indices += 8;
+ data += 8;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_sse2(sum);
}
}
-void av1_calc_indices_dim2_sse2(const int *data, const int *centroids,
- uint8_t *indices, int n, int k) {
+void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
const __m128i v_zero = _mm_setzero_si128();
int l = 1;
__m128i dist[PALETTE_MAX_SIZE];
__m128i ind[2];
+ __m128i sum = _mm_setzero_si128();
for (int i = 0; i < n; i += 4) {
l = (l == 0) ? 1 : 0;
__m128i ind1 = _mm_loadu_si128((__m128i *)data);
- __m128i ind2 = _mm_loadu_si128((__m128i *)(data + 4));
- __m128i indl = _mm_unpacklo_epi32(ind1, ind2);
- __m128i indh = _mm_unpackhi_epi32(ind1, ind2);
- ind1 = _mm_unpacklo_epi32(indl, indh);
- ind2 = _mm_unpackhi_epi32(indl, indh);
for (int j = 0; j < k; j++) {
- __m128i cent0 = _mm_set1_epi32(centroids[2 * j]);
- __m128i cent1 = _mm_set1_epi32(centroids[2 * j + 1]);
- __m128i d1 = _mm_sub_epi32(ind1, cent0);
- __m128i d2 = _mm_sub_epi32(ind2, cent1);
- __m128i d3 = _mm_madd_epi16(d1, d1);
- __m128i d4 = _mm_madd_epi16(d2, d2);
- dist[j] = _mm_add_epi32(d3, d4);
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ const __m128i cent = _mm_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx);
+ const __m128i d1 = _mm_sub_epi16(ind1, cent);
+ dist[j] = _mm_madd_epi16(d1, d1);
}
ind[l] = _mm_setzero_si128();
@@ -89,6 +94,13 @@ void av1_calc_indices_dim2_sse2(const int *data, const int *centroids,
_mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
}
ind[l] = _mm_packus_epi16(ind[l], v_zero);
+ if (total_dist) {
+ // Convert to 64 bit and add to sum.
+ const __m128i dist1 = _mm_unpacklo_epi32(dist[0], v_zero);
+ const __m128i dist2 = _mm_unpackhi_epi32(dist[0], v_zero);
+ sum = _mm_add_epi64(sum, dist1);
+ sum = _mm_add_epi64(sum, dist2);
+ }
if (l == 1) {
__m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
_mm_storel_epi64((__m128i *)indices, p2);
@@ -96,4 +108,7 @@ void av1_calc_indices_dim2_sse2(const int *data, const int *centroids,
}
data += 8;
}
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_sse2(sum);
+ }
}
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index 8c4e39583..75c5172f8 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -98,6 +98,28 @@ static INLINE int16_t accumulate_eob256(__m256i eob256) {
return _mm_extract_epi16(eob, 1);
}
+static AOM_FORCE_INLINE void quantize_lp_16_first(
+ const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256,
+ __m256i *dequant256, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff);
+
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask);
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask);
+ *eob = _mm256_max_epi16(*eob, nz_iscan);
+}
+
static AOM_FORCE_INLINE void quantize_lp_16(
const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256,
@@ -143,29 +165,21 @@ void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
- coeff_ptr += n_coeffs;
- iscan += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
-
// Process DC and the first 15 AC coeffs.
- quantize_lp_16(coeff_ptr, n_coeffs, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
- &quant256, &dequant256, &eob256);
-
- // Overwrite the DC constants with AC constants
- dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
- quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
- round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
-
- n_coeffs += 8 * 2;
-
- // AC only loop.
- while (n_coeffs < 0) {
- quantize_lp_16(coeff_ptr, n_coeffs, iscan, qcoeff_ptr, dqcoeff_ptr,
- &round256, &quant256, &dequant256, &eob256);
-
- n_coeffs += 8 * 2;
+ quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+
+ if (n_coeffs > 16) {
+ // Overwrite the DC constants with AC constants
+ dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+ quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+ round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+
+ // AC only loop.
+ for (int idx = 16; idx < n_coeffs; idx += 16) {
+ quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+ }
}
*eob_ptr = accumulate_eob256(eob256);
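A hedged scalar reading of one quantize_lp coefficient as handled by quantize_lp_16_first above, assuming _mm256_mulhi_epi16 corresponds to a multiply followed by a 16-bit right shift (reference sketch, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Hypothetical per-coefficient model:
     *   qcoeff  = sign(coeff) * (((|coeff| + round) * quant) >> 16)
     *   dqcoeff = qcoeff * dequant
     * and the end-of-block marker advances to iscan + 1 for nonzero qcoeff. */
    static void quantize_lp_one(int16_t coeff, int16_t round, int16_t quant,
                                int16_t dequant, int16_t iscan, int16_t *qcoeff,
                                int16_t *dqcoeff, int16_t *eob) {
      const int abs_q = ((abs(coeff) + round) * quant) >> 16;
      *qcoeff = (int16_t)(coeff < 0 ? -abs_q : abs_q);
      *dqcoeff = (int16_t)(*qcoeff * dequant);
      if (abs_q != 0 && (int16_t)(iscan + 1) > *eob) *eob = (int16_t)(iscan + 1);
    }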
diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c
index 12dda3ad0..57725d179 100644
--- a/av1/encoder/x86/error_intrin_avx2.c
+++ b/av1/encoder/x86/error_intrin_avx2.c
@@ -29,53 +29,122 @@ static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
}
}
-int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
- intptr_t block_size) {
+static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
+ const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error = _mm256_madd_epi16(diff, diff);
+ // r0+r1 r2+r3 | r0+r1 r2+r3 | r4+r5 r6+r7 | r4+r5 r6+r7
+ const __m256i error_hi = _mm256_hadd_epi32(error, error);
+ // r0+r1 | r2+r3 | r4+r5 | r6+r7
+ *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256());
+}
+
+static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
const __m256i zero = _mm256_setzero_si256();
- __m256i sse_256 = zero;
- __m256i sse_hi;
- __m128i sse_128;
- int64_t sse;
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow.
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0);
+}
- if (block_size == 16) {
- // Load 16 elements for coeff and dqcoeff.
- const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
- const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
- // dqcoeff - coeff
- const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
- // madd (dqcoeff - coeff)
- const __m256i error_lo = _mm256_madd_epi16(diff, diff);
- // Save the higher 64 bit of each 128 bit lane.
- const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
- // Add the higher 64 bit to the low 64 bit.
- const __m256i error = _mm256_add_epi32(error_lo, error_hi);
- // Expand each double word in the lower 64 bits to quad word.
- sse_256 = _mm256_unpacklo_epi32(error, zero);
- } else {
- for (int i = 0; i < block_size; i += 16) {
- // Load 16 elements for coeff and dqcoeff.
- const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
- const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
- const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
- const __m256i error = _mm256_madd_epi16(diff, diff);
- // Expand each double word of madd (dqcoeff - coeff) to quad word.
- const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
- const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
- // Add each quad word of madd (dqcoeff - coeff).
- sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
- sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
- coeff += 16;
- dqcoeff += 16;
- }
+static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256,
+ intptr_t num_coeff) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int i = 0; i < num_coeff; i += 64) {
+ // Load 64 elements for coeff and dqcoeff.
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+ const __m256i _coeff_2 = _mm256_loadu_si256((const __m256i *)(coeff + 32));
+ const __m256i _dqcoeff_2 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 32));
+ const __m256i _coeff_3 = _mm256_loadu_si256((const __m256i *)(coeff + 48));
+ const __m256i _dqcoeff_3 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 48));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+ const __m256i diff_2 = _mm256_sub_epi16(_dqcoeff_2, _coeff_2);
+ const __m256i diff_3 = _mm256_sub_epi16(_dqcoeff_3, _coeff_3);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i error_2 = _mm256_madd_epi16(diff_2, diff_2);
+ const __m256i error_3 = _mm256_madd_epi16(diff_3, diff_3);
+ // r00 r01 r02 r03 r04 r05 r06 r07
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+ // r10 r11 r12 r13 r14 r15 r16 r17
+ const __m256i err_final_1 = _mm256_add_epi32(error_2, error_3);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow. r00 r01 r04 r05
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ // r02 r03 r06 r07
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ // r10 r11 r14 r15
+ const __m256i exp1_error_lo = _mm256_unpacklo_epi32(err_final_1, zero);
+ // r12 r13 r16 r17
+ const __m256i exp1_error_hi = _mm256_unpackhi_epi32(err_final_1, zero);
+
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ const __m256i sum_temp_1 = _mm256_add_epi64(exp1_error_hi, exp1_error_lo);
+ const __m256i sse_256_temp = _mm256_add_epi64(sum_temp_1, sum_temp_0);
+ *sse_256 = _mm256_add_epi64(*sse_256, sse_256_temp);
+ coeff += 64;
+ dqcoeff += 64;
}
+}
+
+int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t num_coeff) {
+ assert(num_coeff % 16 == 0);
+ __m256i sse_256 = _mm256_setzero_si256();
+ int64_t sse;
+
+ if (num_coeff == 16)
+ av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256);
+ else if (num_coeff == 32)
+ av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256);
+ else
+ av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff);
+
// Save the higher 64 bit of each 128 bit lane.
- sse_hi = _mm256_srli_si256(sse_256, 8);
+ const __m256i sse_hi = _mm256_srli_si256(sse_256, 8);
// Add the higher 64 bit to the low 64 bit.
sse_256 = _mm256_add_epi64(sse_256, sse_hi);
-
- // Add each 64 bit from each of the 128 bit lane of the 256 bit.
- sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
- _mm256_extractf128_si256(sse_256, 1));
+ // Accumulate the sse_256 register to get final sse
+ const __m128i sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+ _mm256_extractf128_si256(sse_256, 1));
// Store the results.
_mm_storel_epi64((__m128i *)&sse, sse_128);
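All of the unrolled AVX2 helpers above accumulate the same quantity; as a scalar reference sketch (illustrative, not from the patch), av1_block_error_lp reduces to a 64-bit sum of squared coefficient differences:

    #include <stdint.h>

    static int64_t block_error_lp_scalar(const int16_t *coeff,
                                         const int16_t *dqcoeff,
                                         intptr_t num_coeff) {
      int64_t sse = 0;
      for (intptr_t i = 0; i < num_coeff; ++i) {
        const int diff = dqcoeff[i] - coeff[i];
        sse += (int64_t)diff * diff;  /* squared error, accumulated in 64 bits */
      }
      return sse;
    }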
diff --git a/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/av1/encoder/x86/highbd_block_error_intrin_sse2.c
index 4579e4e4a..0287f01f3 100644
--- a/av1/encoder/x86/highbd_block_error_intrin_sse2.c
+++ b/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -33,7 +33,7 @@ int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
__m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
// Check if any values require more than 15 bit
max = _mm_set1_epi32(0x3fff);
- min = _mm_set1_epi32(0xffffc000);
+ min = _mm_set1_epi32((int)0xffffc000);
cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
_mm_cmplt_epi32(mm_coeff, min));
cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index d53b12856..e244d5ec3 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -59,7 +59,7 @@ static INLINE void acc_stat_win7_one_line_avx2(
M_int[k][l] += D1 * X1 + D2 * X2;
const __m256i kl =
- _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -88,7 +88,7 @@ static INLINE void acc_stat_win7_one_line_avx2(
// are (effectively) used as inputs to a multiply-accumulate.
// So if we set the extra pixel slot to 0, then it is effectively
// ignored.
- const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+ const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -260,7 +260,7 @@ static INLINE void acc_stat_highbd_win7_one_line_avx2(
// Load two u16 values from dgd_ijkl combined as a u32,
// then broadcast to 8x u32 slots of a 256
- const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
// dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -297,7 +297,7 @@ static INLINE void acc_stat_highbd_win7_one_line_avx2(
// interleaved copies of two pixels, but we only have one. However, the
// pixels are (effectively) used as inputs to a multiply-accumulate. So
// if we set the extra pixel slot to 0, then it is effectively ignored.
- const __m256i dgd_ijkl = _mm256_set1_epi32((uint32_t)D1);
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
&dgd_ijkl);
@@ -408,7 +408,7 @@ static INLINE void acc_stat_highbd_win5_one_line_avx2(
// Load two u16 values from dgd_ijkl combined as a u32,
// then broadcast to 8x u32 slots of a 256
- const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
// dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -441,7 +441,7 @@ static INLINE void acc_stat_highbd_win5_one_line_avx2(
// interleaved copies of two pixels, but we only have one. However, the
// pixels are (effectively) used as inputs to a multiply-accumulate. So
// if we set the extra pixel slot to 0, then it is effectively ignored.
- const __m256i dgd_ijkl = _mm256_set1_epi32((uint32_t)D1);
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
&dgd_ijkl);
@@ -569,7 +569,7 @@ static INLINE void acc_stat_win5_one_line_avx2(
M_int[k][l] += D1 * X1 + D2 * X2;
const __m256i kl =
- _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -596,7 +596,7 @@ static INLINE void acc_stat_win5_one_line_avx2(
// are (effectively) used as inputs to a multiply-accumulate.
// So if we set the extra pixel slot to 0, then it is effectively
// ignored.
- const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+ const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
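The loadu_uint16()/loadu_uint32() helpers are renamed to signed variants so their return types match the signed parameters of _mm_set1_epi16()/_mm256_set1_epi32(). As a rough sketch only (the real helpers are defined elsewhere in libaom), such unaligned loads are typically written with memcpy to avoid strict-aliasing and alignment pitfalls:

#include <stdint.h>
#include <string.h>

// Hypothetical stand-ins for the loadu_int16()/loadu_int32() helpers
// referenced above; not the actual libaom definitions.
static inline int16_t loadu_int16_sketch(const void *p) {
  int16_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}

static inline int32_t loadu_int32_sketch(const void *p) {
  int32_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}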
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index 3d496ef3c..8208cca63 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -62,7 +62,7 @@ static INLINE void acc_stat_win7_one_line_sse4_1(
M_int[k][l] += D1 * X1 + D2 * X2;
const __m128i kl =
- _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((int16_t *)(dgd_ijk + l))));
acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -91,7 +91,7 @@ static INLINE void acc_stat_win7_one_line_sse4_1(
// are (effectively) used as inputs to a multiply-accumulate.
// So if we set the extra pixel slot to 0, then it is effectively
// ignored.
- const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -265,7 +265,7 @@ static INLINE void acc_stat_highbd_win7_one_line_sse4_1(
// Load two u16 values from dgd as a single u32
// Then broadcast to 4x u32 slots of a 128
- const __m128i dgd_ijkl = _mm_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+ const __m128i dgd_ijkl = _mm_set1_epi32(*((int *)(dgd_ijk + l)));
// dgd_ijkl = [y x y x y x y x] as u16
acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -302,7 +302,7 @@ static INLINE void acc_stat_highbd_win7_one_line_sse4_1(
// interleaved copies of two pixels, but we only have one. However, the
// pixels are (effectively) used as inputs to a multiply-accumulate. So
// if we set the extra pixel slot to 0, then it is effectively ignored.
- const __m128i dgd_ijkl = _mm_set1_epi32((uint32_t)D1);
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1);
acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
&dgd_ijkl);
@@ -414,7 +414,7 @@ static INLINE void acc_stat_highbd_win5_one_line_sse4_1(
// Load two u16 values from dgd as a single u32
// then broadcast to 4x u32 slots of a 128
- const __m128i dgd_ijkl = _mm_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+ const __m128i dgd_ijkl = _mm_set1_epi32(*((int *)(dgd_ijk + l)));
// dgd_ijkl = [y x y x y x y x] as u16
acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -447,7 +447,7 @@ static INLINE void acc_stat_highbd_win5_one_line_sse4_1(
// interleaved copies of two pixels, but we only have one. However, the
// pixels are (effectively) used as inputs to a multiply-accumulate. So
// if we set the extra pixel slot to 0, then it is effectively ignored.
- const __m128i dgd_ijkl = _mm_set1_epi32((uint32_t)D1);
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1);
acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
&dgd_ijkl);
@@ -574,7 +574,7 @@ static INLINE void acc_stat_win5_one_line_sse4_1(
M_int[k][l] += D1 * X1 + D2 * X2;
const __m128i kl =
- _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((int16_t *)(dgd_ijk + l))));
acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -601,7 +601,7 @@ static INLINE void acc_stat_win5_one_line_sse4_1(
// are (effectively) used as inputs to a multiply-accumulate.
// So if we set the extra pixel slot to 0, then it is effectively
// ignored.
- const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index 3bc763c58..a0ab3940c 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c
@@ -31,8 +31,8 @@ INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
// [ m n o p ]
const __m256i pixels = _mm256_set_epi64x(
- loadu_uint64(&diff[0 * stride]), loadu_uint64(&diff[1 * stride]),
- loadu_uint64(&diff[2 * stride]), loadu_uint64(&diff[3 * stride]));
+ loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]),
+ loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride]));
// pixels = [d c b a h g f e] [l k j i p o n m] as i16
const __m256i slli = _mm256_slli_epi64(pixels, 16);
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index 4c4ec1fa7..12ac14619 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c
@@ -29,10 +29,10 @@ INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
// [ i j k l ]
// [ m n o p ]
- const __m128i pixelsa = _mm_set_epi64x(*(uint64_t *)&diff[0 * stride],
- *(uint64_t *)&diff[2 * stride]);
- const __m128i pixelsb = _mm_set_epi64x(*(uint64_t *)&diff[1 * stride],
- *(uint64_t *)&diff[3 * stride]);
+ const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride],
+ *(int64_t *)&diff[2 * stride]);
+ const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride],
+ *(int64_t *)&diff[3 * stride]);
// pixelsa = [d c b a l k j i] as i16
// pixelsb = [h g f e p o n m] as i16
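Aside from the cast change, the layout comments above rely on _mm_set_epi64x() placing its first argument in the upper 64 bits of the result. A small sketch of that packing, using a hypothetical memcpy-based load_row64() helper instead of the raw pointer casts:

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

static inline int64_t load_row64(const int16_t *row) {
  int64_t v;
  memcpy(&v, row, sizeof(v));  // four consecutive int16_t pixels
  return v;
}

static inline __m128i pack_two_rows(const int16_t *diff, int stride) {
  // Upper 64 bits <- row 0 ([a b c d]), lower 64 bits <- row 2 ([i j k l]),
  // giving [d c b a l k j i] when read most-significant-first.
  return _mm_set_epi64x(load_row64(&diff[0 * stride]),
                        load_row64(&diff[2 * stride]));
}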
diff --git a/av1/encoder/x86/reconinter_enc_sse2.c b/av1/encoder/x86/reconinter_enc_sse2.c
index 6455bf3d5..d33fec796 100644
--- a/av1/encoder/x86/reconinter_enc_sse2.c
+++ b/av1/encoder/x86/reconinter_enc_sse2.c
@@ -305,13 +305,12 @@ void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
assert(!(width * height & 7));
n = width * height >> 3;
- const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
- const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
- const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
- const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
+ const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+ const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set1_epi16(wt0);
+ const __m128i w1 = _mm_set1_epi16(wt1);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
for (i = 0; i < n; i++) {
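Replacing the eight-argument _mm_set_epi16() calls with _mm_set1_epi16() is purely a broadcast simplification. A minimal sketch of the equivalence, with the rounding constant (half of the 1 << DIST_PRECISION_BITS weight denominator, presumably added for round-to-nearest before the final shift) passed as a parameter rather than assuming its value:

#include <emmintrin.h>
#include <stdint.h>

static inline __m128i broadcast_round_const(int dist_precision_bits) {
  // Equivalent to _mm_set_epi16(round, round, ..., round) with eight
  // identical arguments: one 16-bit value broadcast to all lanes.
  const int16_t round = (int16_t)((1 << dist_precision_bits) >> 1);
  return _mm_set1_epi16(round);
}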
diff --git a/av1/encoder/x86/reconinter_enc_ssse3.c b/av1/encoder/x86/reconinter_enc_ssse3.c
index 7ac0f0d03..df7aa9585 100644
--- a/av1/encoder/x86/reconinter_enc_ssse3.c
+++ b/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -48,13 +48,12 @@ void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
assert(!(width * height & 15));
n = width * height >> 4;
- const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
- const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+ const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+ const int8_t w1 = (int8_t)jcp_param->bck_offset;
const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
w1, w0, w1, w0);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
for (i = 0; i < n; i++) {
__m128i p0 = xx_loadu_128(comp_pred);
diff --git a/av1/encoder/x86/wedge_utils_avx2.c b/av1/encoder/x86/wedge_utils_avx2.c
index c06bad8f7..bbc62d5f1 100644
--- a/av1/encoder/x86/wedge_utils_avx2.c
+++ b/av1/encoder/x86/wedge_utils_avx2.c
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
uint64_t csse;
const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
- const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
+ const __m256i v_zext_q = yy_set1_64_from_32i(~0);
__m256i v_acc0_q = _mm256_setzero_si256();
@@ -142,7 +142,7 @@ int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
#if ARCH_X86_64
- acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+ acc = _mm_extract_epi64(v_acc_q_0, 0);
#else
xx_storel_64(&acc, v_acc_q_0);
#endif
@@ -155,7 +155,7 @@ int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
*/
void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
const int16_t *b, int N) {
- const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001);
+ const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001);
assert(N % 64 == 0);
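The ~0 argument simply spells the all-ones 32-bit value without an out-of-range literal; assuming the *_set1_64_from_32i helpers zero-extend their 32-bit argument into each 64-bit lane (which is how v_zext_q is used here), the resulting constant is 0x00000000ffffffff per lane, i.e. a mask that zero-extends packed 32-bit partial sums so they can be accumulated in 64 bits without overflow. A minimal SSE2 sketch of that masking:

#include <emmintrin.h>
#include <stdint.h>

static inline __m128i zext_lo32_of_each_64(__m128i v) {
  // Keep only the low 32 bits of each 64-bit lane.
  const __m128i zext_mask = _mm_set1_epi64x(0x00000000ffffffffLL);
  return _mm_and_si128(v, zext_mask);
}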
diff --git a/av1/encoder/x86/wedge_utils_sse2.c b/av1/encoder/x86/wedge_utils_sse2.c
index f3f4b8a75..e665b2e36 100644
--- a/av1/encoder/x86/wedge_utils_sse2.c
+++ b/av1/encoder/x86/wedge_utils_sse2.c
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
uint64_t csse;
const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
- const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
+ const __m128i v_zext_q = xx_set1_64_from_32i(~0);
__m128i v_acc0_q = _mm_setzero_si128();
@@ -175,7 +175,7 @@ int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
#if ARCH_X86_64
- acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+ acc = _mm_cvtsi128_si64(v_acc_q);
#else
xx_storel_64(&acc, v_acc_q);
#endif
diff --git a/av1/ducky_encode.cc b/av1/qmode_rc/ducky_encode.cc
index 5bfc1249f..bd4b76696 100644
--- a/av1/ducky_encode.cc
+++ b/av1/qmode_rc/ducky_encode.cc
@@ -16,6 +16,7 @@
#include <vector>
#include "av1/common/enums.h"
+#include "av1/encoder/rd.h"
#include "config/aom_config.h"
#include "aom/aom_encoder.h"
@@ -26,13 +27,12 @@
#include "av1/encoder/ethread.h"
#include "av1/encoder/firstpass.h"
#include "av1/encoder/temporal_filter.h"
-#include "av1/ducky_encode.h"
+#include "av1/qmode_rc/ducky_encode.h"
#include "common/tools_common.h"
namespace aom {
struct EncoderResource {
- FILE *in_file;
STATS_BUFFER_CTX *stats_buf_ctx;
FIRSTPASS_STATS *stats_buffer;
aom_image_t img;
@@ -47,25 +47,31 @@ class DuckyEncode::EncodeImpl {
int g_usage;
int max_ref_frames;
int speed;
+ int base_qindex;
+ BLOCK_SIZE sb_size;
enum aom_rc_mode rc_end_usage;
aom_rational64_t timestamp_ratio;
std::vector<FIRSTPASS_STATS> stats_list;
EncoderResource enc_resource;
+ struct AvxInputContext input;
};
-DuckyEncode::DuckyEncode(const VideoInfo &video_info, int max_ref_frames,
- int speed) {
+DuckyEncode::DuckyEncode(const VideoInfo &video_info, BLOCK_SIZE sb_size,
+ int max_ref_frames, int speed, int base_qindex) {
impl_ptr_ = std::unique_ptr<EncodeImpl>(new EncodeImpl());
impl_ptr_->video_info = video_info;
impl_ptr_->g_usage = GOOD;
impl_ptr_->max_ref_frames = max_ref_frames;
impl_ptr_->speed = speed;
+ impl_ptr_->base_qindex = base_qindex;
+ impl_ptr_->sb_size = sb_size;
impl_ptr_->rc_end_usage = AOM_Q;
// TODO(angiebird): Set timestamp_ratio properly
// timestamp_ratio.den = cfg->g_timebase.den;
// timestamp_ratio.num = (int64_t)cfg->g_timebase.num * TICKS_PER_SEC;
impl_ptr_->timestamp_ratio = { 1, 1 };
// TODO(angiebird): How to set ptsvol and duration?
+ impl_ptr_->input.filename = impl_ptr_->video_info.file_path.c_str();
}
DuckyEncode::~DuckyEncode() {}
@@ -81,6 +87,10 @@ static AV1EncoderConfig GetEncoderConfig(const VideoInfo &video_info,
// g_timebase is the inverse of frame_rate
cfg.g_timebase.num = video_info.frame_rate.den;
cfg.g_timebase.den = video_info.frame_rate.num;
+ if (pass == AOM_RC_SECOND_PASS) {
+ cfg.rc_twopass_stats_in.sz =
+ (video_info.frame_count + 1) * sizeof(FIRSTPASS_STATS);
+ }
AV1EncoderConfig oxcf = av1_get_encoder_config(&cfg);
// TODO(angiebird): Why didn't we init use_highbitdepth in
// av1_get_encoder_config()?
@@ -106,8 +116,10 @@ static STATS_BUFFER_CTX *CreateStatsBufferCtx(int frame_count,
stats_buf_ctx->total_stats = stats_buf_ctx->stats_in_buf_end;
stats_buf_ctx->total_left_stats =
stats_buf_ctx->stats_in_start + frame_count + 1;
- av1_twopass_zero_stats(stats_buf_ctx->total_left_stats);
- av1_twopass_zero_stats(stats_buf_ctx->total_stats);
+ for (FIRSTPASS_STATS *buffer = stats_buf_ctx->stats_in_start;
+ buffer <= stats_buf_ctx->total_left_stats; ++buffer) {
+ av1_twopass_zero_stats(buffer);
+ }
return stats_buf_ctx;
}
@@ -133,21 +145,66 @@ static FIRSTPASS_STATS ComputeTotalStats(
return total_stats;
}
-static EncoderResource InitEncoder(
- const VideoInfo &video_info, int g_usage, enum aom_rc_mode rc_end_usage,
- aom_enc_pass pass, const std::vector<FIRSTPASS_STATS> *stats_list,
- int max_ref_frames, int speed) {
+static bool FileIsY4m(const char detect[4]) {
+ return memcmp(detect, "YUV4", 4) == 0;
+}
+
+static bool FourccIsIvf(const char detect[4]) {
+ return memcmp(detect, "DKIF", 4) == 0;
+}
+
+static void OpenInputFile(struct AvxInputContext *input) {
+ input->file = fopen(input->filename, "rb");
+ /* For RAW input sources, these bytes will be applied on the first frame
+ * in read_frame().
+ */
+ input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+ input->detect.position = 0;
+ aom_chroma_sample_position_t const csp = AOM_CSP_UNKNOWN;
+ if (input->detect.buf_read == 4 && FileIsY4m(input->detect.buf)) {
+ if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp,
+ input->only_i420) >= 0) {
+ input->file_type = FILE_TYPE_Y4M;
+ input->width = input->y4m.pic_w;
+ input->height = input->y4m.pic_h;
+ input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+ input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+ input->framerate.numerator = input->y4m.fps_n;
+ input->framerate.denominator = input->y4m.fps_d;
+ input->fmt = input->y4m.aom_fmt;
+ input->bit_depth = static_cast<aom_bit_depth_t>(input->y4m.bit_depth);
+ input->color_range = input->y4m.color_range;
+ } else
+ fatal("Unsupported Y4M stream.");
+ } else if (input->detect.buf_read == 4 && FourccIsIvf(input->detect.buf)) {
+ fatal("IVF is not supported as input.");
+ } else {
+ input->file_type = FILE_TYPE_RAW;
+ }
+}
+
+void DuckyEncode::InitEncoder(aom_enc_pass pass,
+ const std::vector<FIRSTPASS_STATS> *stats_list) {
EncoderResource enc_resource = {};
- enc_resource.in_file = fopen(video_info.file_path.c_str(), "r");
enc_resource.lookahead_push_count = 0;
- aom_img_alloc(&enc_resource.img, video_info.img_fmt, video_info.frame_width,
- video_info.frame_height, /*align=*/1);
- AV1EncoderConfig oxcf = GetEncoderConfig(video_info, g_usage, pass);
+ OpenInputFile(&impl_ptr_->input);
+ if (impl_ptr_->input.file_type != FILE_TYPE_Y4M) {
+ aom_img_alloc(&enc_resource.img, impl_ptr_->video_info.img_fmt,
+ impl_ptr_->video_info.frame_width,
+ impl_ptr_->video_info.frame_height, /*align=*/1);
+ }
+ AV1EncoderConfig oxcf =
+ GetEncoderConfig(impl_ptr_->video_info, impl_ptr_->g_usage, pass);
oxcf.dec_model_cfg.decoder_model_info_present_flag = 0;
oxcf.dec_model_cfg.display_model_info_present_flag = 0;
- oxcf.ref_frm_cfg.max_reference_frames = max_ref_frames;
- oxcf.speed = speed;
- av1_initialize_enc(g_usage, rc_end_usage);
+ oxcf.ref_frm_cfg.max_reference_frames = impl_ptr_->max_ref_frames;
+ oxcf.speed = impl_ptr_->speed;
+ if (impl_ptr_->sb_size == BLOCK_64X64)
+ oxcf.tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+ else
+ oxcf.tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_128X128;
+
+ av1_initialize_enc(impl_ptr_->g_usage, impl_ptr_->rc_end_usage);
AV1_PRIMARY *ppi =
av1_create_primary_compressor(nullptr,
/*num_lap_buffers=*/0, &oxcf);
@@ -162,8 +219,8 @@ static EncoderResource InitEncoder(
aom_codec_err_t res = AOM_CODEC_OK;
(void)res;
- enc_resource.stats_buf_ctx =
- CreateStatsBufferCtx(video_info.frame_count, &enc_resource.stats_buffer);
+ enc_resource.stats_buf_ctx = CreateStatsBufferCtx(
+ impl_ptr_->video_info.frame_count, &enc_resource.stats_buffer);
if (pass == AOM_RC_SECOND_PASS) {
assert(stats_list != nullptr);
std::copy(stats_list->begin(), stats_list->end(),
@@ -172,8 +229,8 @@ static EncoderResource InitEncoder(
oxcf.twopass_stats_in.buf = enc_resource.stats_buffer;
// We need +1 here because av1 encoder assumes
// oxcf.twopass_stats_in.buf[video_info.frame_count] has the total_stats
- oxcf.twopass_stats_in.sz =
- (video_info.frame_count + 1) * sizeof(enc_resource.stats_buffer[0]);
+ oxcf.twopass_stats_in.sz = (impl_ptr_->video_info.frame_count + 1) *
+ sizeof(enc_resource.stats_buffer[0]);
} else {
assert(pass == AOM_RC_FIRST_PASS);
// We don't use stats_list for AOM_RC_FIRST_PASS.
@@ -196,7 +253,7 @@ static EncoderResource InitEncoder(
assert(buffer_pool != nullptr);
const AV1_COMP *cpi = ppi->cpi;
SequenceHeader *seq_params = ppi->cpi->common.seq_params;
-
+ set_sb_size(seq_params, impl_ptr_->sb_size);
ppi->seq_params_locked = 1;
assert(ppi->lookahead == nullptr);
@@ -211,12 +268,18 @@ static EncoderResource InitEncoder(
av1_tf_info_alloc(&cpi->ppi->tf_info, cpi);
assert(ppi->lookahead != nullptr);
- return enc_resource;
+
+ impl_ptr_->enc_resource = enc_resource;
}
-static void FreeEncoder(EncoderResource *enc_resource) {
- fclose(enc_resource->in_file);
- enc_resource->in_file = nullptr;
+static void CloseInputFile(struct AvxInputContext *input) {
+ fclose(input->file);
+ if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
+}
+
+void DuckyEncode::FreeEncoder() {
+ EncoderResource *enc_resource = &impl_ptr_->enc_resource;
+ CloseInputFile(&impl_ptr_->input);
aom_img_free(&enc_resource->img);
DestroyStatsBufferCtx(&enc_resource->stats_buf_ctx,
&enc_resource->stats_buffer);
@@ -226,30 +289,42 @@ static void FreeEncoder(EncoderResource *enc_resource) {
enc_resource->ppi = nullptr;
}
+static int ReadFrame(struct AvxInputContext *input_ctx, aom_image_t *img) {
+ FILE *f = input_ctx->file;
+ y4m_input *y4m = &input_ctx->y4m;
+ int shortread = 0;
+
+ if (input_ctx->file_type == FILE_TYPE_Y4M) {
+ if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
+ } else {
+ shortread = read_yuv_frame(input_ctx, img);
+ }
+
+ return !shortread;
+}
+
std::vector<FIRSTPASS_STATS> DuckyEncode::ComputeFirstPassStats() {
aom_enc_pass pass = AOM_RC_FIRST_PASS;
- EncoderResource enc_resource = InitEncoder(
- impl_ptr_->video_info, impl_ptr_->g_usage, impl_ptr_->rc_end_usage, pass,
- nullptr, impl_ptr_->max_ref_frames, impl_ptr_->speed);
- AV1_PRIMARY *ppi = enc_resource.ppi;
+ InitEncoder(pass, nullptr);
+ AV1_PRIMARY *ppi = impl_ptr_->enc_resource.ppi;
+ EncoderResource *enc_resource = &impl_ptr_->enc_resource;
struct lookahead_ctx *lookahead = ppi->lookahead;
int frame_count = impl_ptr_->video_info.frame_count;
- FILE *in_file = enc_resource.in_file;
aom_rational64_t timestamp_ratio = impl_ptr_->timestamp_ratio;
// TODO(angiebird): Ideally, ComputeFirstPassStats() doesn't output
// bitstream. Do we need bitstream buffer here?
std::vector<uint8_t> buf(1000);
std::vector<FIRSTPASS_STATS> stats_list;
for (int i = 0; i < frame_count; ++i) {
- if (aom_img_read(&enc_resource.img, in_file)) {
+ if (ReadFrame(&impl_ptr_->input, &impl_ptr_->enc_resource.img)) {
// TODO(angiebird): Set ts_start/ts_end properly
- int64_t ts_start = enc_resource.lookahead_push_count;
+ int64_t ts_start = enc_resource->lookahead_push_count;
int64_t ts_end = ts_start + 1;
YV12_BUFFER_CONFIG sd;
- image2yuvconfig(&enc_resource.img, &sd);
+ image2yuvconfig(&enc_resource->img, &sd);
av1_lookahead_push(lookahead, &sd, ts_start, ts_end,
/*use_highbitdepth=*/0, /*flags=*/0);
- ++enc_resource.lookahead_push_count;
+ ++enc_resource->lookahead_push_count;
AV1_COMP_DATA cpi_data = {};
cpi_data.cx_data = buf.data();
cpi_data.cx_data_sz = buf.size();
@@ -270,21 +345,20 @@ std::vector<FIRSTPASS_STATS> DuckyEncode::ComputeFirstPassStats() {
}
av1_end_first_pass(ppi->cpi);
- FreeEncoder(&enc_resource);
+ FreeEncoder();
return stats_list;
}
void DuckyEncode::StartEncode(const std::vector<FIRSTPASS_STATS> &stats_list) {
aom_enc_pass pass = AOM_RC_SECOND_PASS;
impl_ptr_->stats_list = stats_list;
- impl_ptr_->enc_resource = InitEncoder(
- impl_ptr_->video_info, impl_ptr_->g_usage, impl_ptr_->rc_end_usage, pass,
- &stats_list, impl_ptr_->max_ref_frames, impl_ptr_->speed);
+ InitEncoder(pass, &stats_list);
write_temp_delimiter_ = true;
}
static void DuckyEncodeInfoSetGopStruct(AV1_PRIMARY *ppi,
- GopStruct gop_struct) {
+ const GopStruct &gop_struct,
+ const GopEncodeInfo &gop_encode_info) {
GF_GROUP *gf_group = &ppi->gf_group;
ppi->p_rc.baseline_gf_interval = gop_struct.show_frame_count;
ppi->internal_altref_allowed = 1;
@@ -299,6 +373,9 @@ static void DuckyEncodeInfoSetGopStruct(AV1_PRIMARY *ppi,
gf_group->frame_type[i] = !frame.is_key_frame;
+ gf_group->q_val[i] = gop_encode_info.param_list[i].q_index;
+ gf_group->rdmult_val[i] = gop_encode_info.param_list[i].rdmult;
+
gf_group->cur_frame_idx[i] = 0;
gf_group->arf_src_offset[i] = frame.order_idx - frame.display_idx;
gf_group->cur_frame_idx[i] = frame.display_idx;
@@ -330,10 +407,25 @@ static void DuckyEncodeInfoSetGopStruct(AV1_PRIMARY *ppi,
static void DuckyEncodeInfoSetEncodeFrameDecision(
DuckyEncodeInfo *ducky_encode_info, const EncodeFrameDecision &decision) {
DuckyEncodeFrameInfo *frame_info = &ducky_encode_info->frame_info;
+ *frame_info = {};
frame_info->qp_mode = static_cast<DUCKY_ENCODE_FRAME_MODE>(decision.qp_mode);
frame_info->gop_mode = static_cast<DUCKY_ENCODE_GOP_MODE>(decision.gop_mode);
frame_info->q_index = decision.parameters.q_index;
frame_info->rdmult = decision.parameters.rdmult;
+ const size_t num_superblocks =
+ decision.parameters.superblock_encode_params.size();
+ frame_info->delta_q_enabled = 0;
+ if (num_superblocks > 1) {
+ frame_info->delta_q_enabled = 1;
+ frame_info->superblock_encode_qindex = new int[num_superblocks];
+ frame_info->superblock_encode_rdmult = new int[num_superblocks];
+ for (size_t i = 0; i < num_superblocks; ++i) {
+ frame_info->superblock_encode_qindex[i] =
+ decision.parameters.superblock_encode_params[i].q_index;
+ frame_info->superblock_encode_rdmult[i] =
+ decision.parameters.superblock_encode_params[i].rdmult;
+ }
+ }
}
static void DuckyEncodeInfoGetEncodeFrameResult(
@@ -370,7 +462,8 @@ static void WriteObu(AV1_PRIMARY *ppi, AV1_COMP_DATA *cpi_data) {
obu_header_size + obu_payload_size + length_field_size;
}
-TplGopStats DuckyEncode::ObtainTplStats(const GopStruct gop_struct) {
+TplGopStats DuckyEncode::ObtainTplStats(const GopStruct gop_struct,
+ bool rate_dist_present) {
TplGopStats tpl_gop_stats;
AV1_PRIMARY *ppi = impl_ptr_->enc_resource.ppi;
@@ -378,6 +471,8 @@ TplGopStats DuckyEncode::ObtainTplStats(const GopStruct gop_struct) {
for (size_t idx = 0; idx < gop_struct.gop_frame_list.size(); ++idx) {
TplFrameStats tpl_frame_stats = {};
+ tpl_frame_stats.rate_dist_present = rate_dist_present;
+
TplDepFrame *tpl_frame = &ppi->tpl_data.tpl_frame[idx];
if (gop_struct.gop_frame_list[idx].update_type == GopFrameType::kOverlay ||
gop_struct.gop_frame_list[idx].update_type ==
@@ -412,8 +507,21 @@ TplGopStats DuckyEncode::ObtainTplStats(const GopStruct gop_struct) {
block_stats.col = mi_col * MI_SIZE;
block_stats.height = (1 << block_mis_log2) * MI_SIZE;
block_stats.width = (1 << block_mis_log2) * MI_SIZE;
- block_stats.inter_cost = tpl_stats_ptr->inter_cost;
- block_stats.intra_cost = tpl_stats_ptr->intra_cost;
+
+ block_stats.inter_cost =
+ RDCOST(tpl_frame->base_rdmult, tpl_stats_ptr->recrf_rate,
+ tpl_stats_ptr->recrf_dist);
+ block_stats.intra_cost =
+ RDCOST(tpl_frame->base_rdmult, tpl_stats_ptr->intra_rate,
+ tpl_stats_ptr->intra_dist);
+
+ if (tpl_frame_stats.rate_dist_present) {
+ block_stats.recrf_dist = tpl_stats_ptr->recrf_dist;
+ block_stats.recrf_rate = tpl_stats_ptr->recrf_rate;
+ block_stats.intra_pred_err = tpl_stats_ptr->intra_sse;
+ block_stats.inter_pred_err = tpl_stats_ptr->recrf_sse;
+ }
+
block_stats.ref_frame_index = { -1, -1 };
for (int i = 0; i < kBlockRefCount; ++i) {
@@ -437,8 +545,12 @@ TplGopStats DuckyEncode::ObtainTplStats(const GopStruct gop_struct) {
}
// Obtain TPL stats through ducky_encode.
+// TODO(jianj): Populate rate_dist_present flag through qmode_rc_encoder
std::vector<TplGopStats> DuckyEncode::ComputeTplStats(
- const GopStructList &gop_list) {
+ const std::vector<FIRSTPASS_STATS> &stats_list,
+ const GopStructList &gop_list,
+ const GopEncodeInfoList &gop_encode_info_list) {
+ StartEncode(stats_list);
std::vector<TplGopStats> tpl_gop_stats_list;
AV1_PRIMARY *ppi = impl_ptr_->enc_resource.ppi;
const VideoInfo &video_info = impl_ptr_->video_info;
@@ -448,28 +560,54 @@ std::vector<TplGopStats> DuckyEncode::ComputeTplStats(
// Go through each gop and encode each frame in the gop
for (size_t i = 0; i < gop_list.size(); ++i) {
const aom::GopStruct &gop_struct = gop_list[i];
- DuckyEncodeInfoSetGopStruct(ppi, gop_struct);
+ const aom::GopEncodeInfo &gop_encode_info = gop_encode_info_list[i];
+
+ DuckyEncodeInfoSetGopStruct(ppi, gop_struct, gop_encode_info);
aom::TplGopStats tpl_gop_stats;
- for (auto &frame : gop_struct.gop_frame_list) {
+ for (auto &frame_param : gop_encode_info.param_list) {
// encoding frame frame_number
aom::EncodeFrameDecision frame_decision = { aom::EncodeFrameMode::kQindex,
aom::EncodeGopMode::kGopRcl,
- { 128, -1 } };
- (void)frame;
+ frame_param };
EncodeFrame(frame_decision);
if (ppi->cpi->common.show_frame) pending_ctx_size_ = 0;
write_temp_delimiter_ = ppi->cpi->common.show_frame;
}
- tpl_gop_stats = ObtainTplStats(gop_struct);
- // TODO(jingning): Set the tpl stats file format and populate the stats.
+ // The rate_dist_present flag still needs to be populated.
+ tpl_gop_stats = ObtainTplStats(gop_struct, 0);
tpl_gop_stats_list.push_back(tpl_gop_stats);
}
-
+ EndEncode();
return tpl_gop_stats_list;
}
-// Obtain TPL stats through ducky_encode.
+std::vector<TplGopStats> DuckyEncode::ComputeTwoPassTplStats(
+ const std::vector<FIRSTPASS_STATS> &stats_list,
+ const GopStructList &gop_list,
+ const GopEncodeInfoList &gop_encode_info_list,
+ const GopEncodeInfoList &alt_gop_encode_info_list) {
+ std::vector<TplGopStats> first_tpl_gop_stats_list =
+ ComputeTplStats(stats_list, gop_list, gop_encode_info_list);
+ const std::vector<TplGopStats> second_tpl_gop_stats_list =
+ ComputeTplStats(stats_list, gop_list, alt_gop_encode_info_list);
+ assert(first_tpl_gop_stats_list.size() == second_tpl_gop_stats_list.size());
+
+ // Set alternate_block_stats_list in first_tpl_gop_stats_list
+ // and return first_tpl_gop_stats_list
+ for (size_t i = 0; i < first_tpl_gop_stats_list.size(); ++i) {
+ for (size_t j = 0; j < first_tpl_gop_stats_list[i].frame_stats_list.size();
+ ++j) {
+ first_tpl_gop_stats_list[i]
+ .frame_stats_list[j]
+ .alternate_block_stats_list =
+ second_tpl_gop_stats_list[i].frame_stats_list[j].block_stats_list;
+ }
+ }
+ return first_tpl_gop_stats_list;
+}
+
+// Conduct the final encoding process.
std::vector<EncodeFrameResult> DuckyEncode::EncodeVideo(
const GopStructList &gop_list,
const GopEncodeInfoList &gop_encode_info_list) {
@@ -483,17 +621,17 @@ std::vector<EncodeFrameResult> DuckyEncode::EncodeVideo(
// Go through each gop and encode each frame in the gop
for (size_t i = 0; i < gop_list.size(); ++i) {
const aom::GopStruct &gop_struct = gop_list[i];
- DuckyEncodeInfoSetGopStruct(ppi, gop_struct);
- aom::GopEncodeInfo gop_encode_info = gop_encode_info_list[i];
+ const aom::GopEncodeInfo &gop_encode_info = gop_encode_info_list[i];
+ DuckyEncodeInfoSetGopStruct(ppi, gop_struct, gop_encode_info);
for (auto &frame_param : gop_encode_info.param_list) {
aom::EncodeFrameDecision frame_decision = { aom::EncodeFrameMode::kQindex,
aom::EncodeGopMode::kGopRcl,
frame_param };
- EncodeFrame(frame_decision);
+ EncodeFrameResult temp_result = EncodeFrame(frame_decision);
if (ppi->cpi->common.show_frame) {
bitstream_buf_.resize(pending_ctx_size_);
- EncodeFrameResult encode_frame_result = {};
+ EncodeFrameResult encode_frame_result = temp_result;
encode_frame_result.bitstream_buf = bitstream_buf_;
encoded_frame_list.push_back(encode_frame_result);
@@ -513,11 +651,10 @@ EncodeFrameResult DuckyEncode::EncodeFrame(
AV1_PRIMARY *ppi = impl_ptr_->enc_resource.ppi;
aom_image_t *img = &impl_ptr_->enc_resource.img;
AV1_COMP *const cpi = ppi->cpi;
- FILE *in_file = impl_ptr_->enc_resource.in_file;
struct lookahead_ctx *lookahead = ppi->lookahead;
while (!av1_lookahead_full(lookahead)) {
- if (aom_img_read(img, in_file)) {
+ if (ReadFrame(&impl_ptr_->input, img)) {
YV12_BUFFER_CONFIG sd;
image2yuvconfig(img, &sd);
int64_t ts_start = impl_ptr_->enc_resource.lookahead_push_count;
@@ -565,10 +702,12 @@ EncodeFrameResult DuckyEncode::EncodeFrame(
fprintf(stderr, "frame %d, qp = %d, size %d, PSNR %f\n",
encode_frame_result.global_order_idx, encode_frame_result.q_index,
encode_frame_result.rate, encode_frame_result.psnr);
+ delete[] cpi->ducky_encode_info.frame_info.superblock_encode_qindex;
+ delete[] cpi->ducky_encode_info.frame_info.superblock_encode_rdmult;
return encode_frame_result;
}
-void DuckyEncode::EndEncode() { FreeEncoder(&impl_ptr_->enc_resource); }
+void DuckyEncode::EndEncode() { FreeEncoder(); }
void DuckyEncode::AllocateBitstreamBuffer(const VideoInfo &video_info) {
pending_ctx_size_ = 0;
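OpenInputFile() above decides the input type by sniffing the first four bytes of the file: "YUV4" selects Y4M, "DKIF" (IVF) is rejected, and anything else falls back to raw YUV, with the sniffed bytes replayed on the first raw frame. A stripped-down, hypothetical sketch of that dispatch (names here are made up, not libaom API):

#include <cstdio>
#include <cstring>

enum class InputKind { kY4m, kIvf, kRaw };

static InputKind SniffInput(std::FILE *f) {
  char magic[4] = {};
  const size_t n = std::fread(magic, 1, sizeof(magic), f);
  if (n == sizeof(magic) && std::memcmp(magic, "YUV4", 4) == 0)
    return InputKind::kY4m;
  if (n == sizeof(magic) && std::memcmp(magic, "DKIF", 4) == 0)
    return InputKind::kIvf;  // rejected by DuckyEncode
  return InputKind::kRaw;    // sniffed bytes are reused for the first frame
}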
diff --git a/av1/ducky_encode.h b/av1/qmode_rc/ducky_encode.h
index ffa53b08f..5dee2a555 100644
--- a/av1/ducky_encode.h
+++ b/av1/qmode_rc/ducky_encode.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AV1_DUCKY_ENCODE_H_
-#define AOM_AV1_DUCKY_ENCODE_H_
+#ifndef AOM_AV1_QMODE_RC_DUCKY_ENCODE_H_
+#define AOM_AV1_QMODE_RC_DUCKY_ENCODE_H_
#include <cstddef>
#include <cstdint>
@@ -20,7 +20,7 @@
#include "aom/aom_encoder.h"
#include "av1/encoder/firstpass.h"
-#include "av1/ratectrl_qmode_interface.h"
+#include "av1/qmode_rc/ratectrl_qmode_interface.h"
namespace aom {
struct VideoInfo {
@@ -73,13 +73,26 @@ using GopEncodeInfoList = std::vector<GopEncodeInfo>;
// invalid.
class DuckyEncode {
public:
- explicit DuckyEncode(const VideoInfo &video_info, int max_ref_frames,
- int speed = 3);
+ explicit DuckyEncode(const VideoInfo &video_info, BLOCK_SIZE sb_size,
+ int max_ref_frames, int speed, int base_qindex);
~DuckyEncode();
std::vector<FIRSTPASS_STATS> ComputeFirstPassStats();
void StartEncode(const std::vector<FIRSTPASS_STATS> &stats_list);
- TplGopStats ObtainTplStats(const GopStruct gop_struct);
- std::vector<TplGopStats> ComputeTplStats(const GopStructList &gop_list);
+
+ TplGopStats ObtainTplStats(const GopStruct gop_struct,
+ bool rate_dist_present);
+
+ std::vector<TplGopStats> ComputeTplStats(
+ const std::vector<FIRSTPASS_STATS> &stats_list,
+ const GopStructList &gop_list,
+ const GopEncodeInfoList &gop_encode_info_list);
+
+ std::vector<TplGopStats> ComputeTwoPassTplStats(
+ const std::vector<FIRSTPASS_STATS> &stats_list,
+ const GopStructList &gop_list,
+ const GopEncodeInfoList &gop_encode_info_list,
+ const GopEncodeInfoList &alt_gop_encode_info_list);
+
std::vector<EncodeFrameResult> EncodeVideo(
const GopStructList &gop_list,
const GopEncodeInfoList &gop_encode_info_list);
@@ -88,6 +101,11 @@ class DuckyEncode {
void AllocateBitstreamBuffer(const VideoInfo &video_info);
private:
+ void InitEncoder(aom_enc_pass pass,
+ const std::vector<FIRSTPASS_STATS> *stats_list);
+ void FreeEncoder();
+
+ private:
class EncodeImpl;
std::unique_ptr<EncodeImpl> impl_ptr_;
bool write_temp_delimiter_;
@@ -96,4 +114,4 @@ class DuckyEncode {
};
} // namespace aom
-#endif // AOM_AV1_DUCKY_ENCODE_H_
+#endif // AOM_AV1_QMODE_RC_DUCKY_ENCODE_H_
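Given the reworked signatures above (first-pass stats are now passed into ComputeTplStats(), which starts and ends its own encode session), a rough usage sketch of the updated API; the GOP structures and parameter lists are assumed to come from the rate controller, and the numeric constructor arguments are placeholders:

#include <vector>
#include "av1/qmode_rc/ducky_encode.h"

void RunTwoPassSketch(const aom::VideoInfo &video_info,
                      const aom::GopStructList &gop_list,
                      const aom::GopEncodeInfoList &tpl_pass_info,
                      const aom::GopEncodeInfoList &final_pass_info) {
  aom::DuckyEncode encoder(video_info, BLOCK_64X64, /*max_ref_frames=*/3,
                           /*speed=*/3, /*base_qindex=*/128);
  // First pass: gather per-frame stats.
  std::vector<FIRSTPASS_STATS> stats = encoder.ComputeFirstPassStats();
  // TPL pass: per-GOP stats, driven by the TPL-pass encode parameters.
  std::vector<aom::TplGopStats> tpl_stats =
      encoder.ComputeTplStats(stats, gop_list, tpl_pass_info);
  // Final pass: encode with the chosen per-frame parameters.
  encoder.StartEncode(stats);
  std::vector<aom::EncodeFrameResult> frames =
      encoder.EncodeVideo(gop_list, final_pass_info);
  encoder.EndEncode();
  (void)tpl_stats;
  (void)frames;
}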
diff --git a/av1/ratectrl_qmode.cc b/av1/qmode_rc/ratectrl_qmode.cc
index 8edefd77a..0a2892d89 100644
--- a/av1/ratectrl_qmode.cc
+++ b/av1/qmode_rc/ratectrl_qmode.cc
@@ -8,7 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/ratectrl_qmode.h"
+#include "av1/qmode_rc/ratectrl_qmode.h"
#include <algorithm>
#include <cassert>
@@ -16,6 +16,8 @@
#include <functional>
#include <numeric>
#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
#include <vector>
#include "aom/aom_codec.h"
@@ -68,7 +70,7 @@ void SetGopFrameByType(GopFrameType gop_frame_type, GopFrame *gop_frame) {
gop_frame->is_key_frame = 0;
gop_frame->is_arf_frame = 1;
gop_frame->is_show_frame = 0;
- gop_frame->is_golden_frame = 0;
+ gop_frame->is_golden_frame = gop_frame->layer_depth <= 2 ? 1 : 0;
gop_frame->encode_ref_mode = EncodeRefMode::kRegular;
break;
case GopFrameType::kRegularLeaf:
@@ -106,10 +108,10 @@ GopFrame GopFrameBasic(int global_coding_idx_offset,
gop_frame.display_idx = display_idx;
gop_frame.global_coding_idx = global_coding_idx_offset + coding_idx;
gop_frame.global_order_idx = global_order_idx_offset + order_idx;
- SetGopFrameByType(gop_frame_type, &gop_frame);
+ gop_frame.layer_depth = depth + kLayerDepthOffset;
gop_frame.colocated_ref_idx = -1;
gop_frame.update_ref_idx = -1;
- gop_frame.layer_depth = depth + kLayerDepthOffset;
+ SetGopFrameByType(gop_frame_type, &gop_frame);
return gop_frame;
}
@@ -201,13 +203,13 @@ GopStruct ConstructGop(RefFrameManager *ref_frame_manager, int show_frame_count,
ref_frame_manager->UpdateRefFrameTable(&gop_frame);
gop_struct.gop_frame_list.push_back(gop_frame);
ConstructGopMultiLayer(&gop_struct, ref_frame_manager,
- ref_frame_manager->ForwardMaxSize(), arf_depth + 1,
+ ref_frame_manager->MaxRefFrame() - 1, arf_depth + 1,
order_start, order_end);
// Overlay
gop_frame =
GopFrameBasic(global_coding_idx_offset, global_order_idx_offset,
static_cast<int>(gop_struct.gop_frame_list.size()),
- order_end, ref_frame_manager->ForwardMaxSize(),
+ order_end, ref_frame_manager->MaxRefFrame() - 1,
gop_struct.display_tracker, GopFrameType::kOverlay);
ref_frame_manager->UpdateRefFrameTable(&gop_frame);
gop_struct.gop_frame_list.push_back(gop_frame);
@@ -249,11 +251,6 @@ Status AV1RateControlQMode::SetRcParam(const RateControlParam &rc_param) {
<< ") must be in the range [1, 7].";
return { AOM_CODEC_INVALID_PARAM, error_message.str() };
}
- if (rc_param.max_depth < 1 || rc_param.max_depth > 5) {
- error_message << "max_depth (" << rc_param.max_depth
- << ") must be in the range [1, 5].";
- return { AOM_CODEC_INVALID_PARAM, error_message.str() };
- }
if (rc_param.base_q_index < 0 || rc_param.base_q_index > 255) {
error_message << "base_q_index (" << rc_param.base_q_index
<< ") must be in the range [0, 255].";
@@ -854,11 +851,21 @@ StatusOr<GopStructList> AV1RateControlQMode::DetermineGopInfo(
const FirstpassInfo &firstpass_info) {
const int stats_size = static_cast<int>(firstpass_info.stats_list.size());
GopStructList gop_list;
- RefFrameManager ref_frame_manager(rc_param_.ref_frame_table_size);
+ RefFrameManager ref_frame_manager(rc_param_.ref_frame_table_size,
+ rc_param_.max_ref_frames);
+
+ // Make a copy of the first pass stats, and analyze them
+ FirstpassInfo fp_info_copy = firstpass_info;
+ av1_mark_flashes(fp_info_copy.stats_list.data(),
+ fp_info_copy.stats_list.data() + stats_size);
+ av1_estimate_noise(fp_info_copy.stats_list.data(),
+ fp_info_copy.stats_list.data() + stats_size);
+ av1_estimate_coeff(fp_info_copy.stats_list.data(),
+ fp_info_copy.stats_list.data() + stats_size);
int global_coding_idx_offset = 0;
int global_order_idx_offset = 0;
- std::vector<int> key_frame_list = GetKeyFrameList(firstpass_info);
+ std::vector<int> key_frame_list = GetKeyFrameList(fp_info_copy);
key_frame_list.push_back(stats_size); // a sentinel value
for (size_t ki = 0; ki + 1 < key_frame_list.size(); ++ki) {
int frames_to_key = key_frame_list[ki + 1] - key_frame_list[ki];
@@ -866,11 +873,11 @@ StatusOr<GopStructList> AV1RateControlQMode::DetermineGopInfo(
std::vector<REGIONS> regions_list(MAX_FIRSTPASS_ANALYSIS_FRAMES);
int total_regions = 0;
- av1_identify_regions(firstpass_info.stats_list.data() + key_order_index,
+ av1_identify_regions(fp_info_copy.stats_list.data() + key_order_index,
frames_to_key, 0, regions_list.data(), &total_regions);
regions_list.resize(total_regions);
std::vector<int> gf_intervals = PartitionGopIntervals(
- rc_param_, firstpass_info.stats_list, regions_list, key_order_index,
+ rc_param_, fp_info_copy.stats_list, regions_list, key_order_index,
/*frames_since_key=*/0, frames_to_key);
for (size_t gi = 0; gi < gf_intervals.size(); ++gi) {
const bool has_key_frame = gi == 0;
@@ -1011,6 +1018,9 @@ StatusOr<TplFrameDepStats> CreateTplFrameDepStatsWithoutPropagation(
}
}
}
+
+ frame_dep_stats.rdcost = TplFrameDepStatsAccumulateInterCost(frame_dep_stats);
+
return frame_dep_stats;
}
@@ -1052,7 +1062,19 @@ double TplFrameDepStatsAccumulateIntraCost(
for (const auto &row : frame_dep_stats.unit_stats) {
sum = std::accumulate(row.begin(), row.end(), sum, getIntraCost);
}
- return sum;
+ return std::max(sum, 1.0);
+}
+
+double TplFrameDepStatsAccumulateInterCost(
+ const TplFrameDepStats &frame_dep_stats) {
+ auto getInterCost = [](double sum, const TplUnitDepStats &unit) {
+ return sum + unit.inter_cost;
+ };
+ double sum = 0;
+ for (const auto &row : frame_dep_stats.unit_stats) {
+ sum = std::accumulate(row.begin(), row.end(), sum, getInterCost);
+ }
+ return std::max(sum, 1.0);
}
double TplFrameDepStatsAccumulate(const TplFrameDepStats &frame_dep_stats) {
@@ -1063,7 +1085,7 @@ double TplFrameDepStatsAccumulate(const TplFrameDepStats &frame_dep_stats) {
for (const auto &row : frame_dep_stats.unit_stats) {
sum = std::accumulate(row.begin(), row.end(), sum, getOverallCost);
}
- return sum;
+ return std::max(sum, 1.0);
}
// This is a generalization of GET_MV_RAWPEL that allows for an arbitrary
@@ -1153,7 +1175,9 @@ void TplFrameDepStatsPropagate(int coding_idx,
}
std::vector<RefFrameTable> AV1RateControlQMode::GetRefFrameTableList(
- const GopStruct &gop_struct, RefFrameTable ref_frame_table) {
+ const GopStruct &gop_struct,
+ const std::vector<LookaheadStats> &lookahead_stats,
+ RefFrameTable ref_frame_table) {
if (gop_struct.global_coding_idx_offset == 0) {
// For the first GOP, ref_frame_table need not be initialized. This is fine,
// because the first frame (a key frame) will fully initialize it.
@@ -1180,14 +1204,46 @@ std::vector<RefFrameTable> AV1RateControlQMode::GetRefFrameTableList(
}
ref_frame_table_list.push_back(ref_frame_table);
}
+
+ int gop_size_offset = static_cast<int>(gop_struct.gop_frame_list.size());
+
+ for (const auto &lookahead_stat : lookahead_stats) {
+ for (GopFrame gop_frame : lookahead_stat.gop_struct->gop_frame_list) {
+ if (gop_frame.is_key_frame) {
+ ref_frame_table.assign(rc_param_.ref_frame_table_size, gop_frame);
+ } else if (gop_frame.update_ref_idx != -1) {
+ assert(gop_frame.update_ref_idx <
+ static_cast<int>(ref_frame_table.size()));
+ gop_frame.coding_idx += gop_size_offset;
+ ref_frame_table[gop_frame.update_ref_idx] = gop_frame;
+ }
+ ref_frame_table_list.push_back(ref_frame_table);
+ }
+ gop_size_offset +=
+ static_cast<int>(lookahead_stat.gop_struct->gop_frame_list.size());
+ }
+
return ref_frame_table_list;
}
StatusOr<TplGopDepStats> ComputeTplGopDepStats(
const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
const std::vector<RefFrameTable> &ref_frame_table_list) {
+ std::vector<const TplFrameStats *> tpl_frame_stats_list_with_lookahead;
+ for (const auto &tpl_frame_stats : tpl_gop_stats.frame_stats_list) {
+ tpl_frame_stats_list_with_lookahead.push_back(&tpl_frame_stats);
+ }
+ for (const auto &lookahead_stat : lookahead_stats) {
+ for (const auto &tpl_frame_stats :
+ lookahead_stat.tpl_gop_stats->frame_stats_list) {
+ tpl_frame_stats_list_with_lookahead.push_back(&tpl_frame_stats);
+ }
+ }
+
const int frame_count =
- static_cast<int>(tpl_gop_stats.frame_stats_list.size());
+ static_cast<int>(tpl_frame_stats_list_with_lookahead.size());
+
// Create the struct to store TPL dependency stats
TplGopDepStats tpl_gop_dep_stats;
@@ -1195,7 +1251,7 @@ StatusOr<TplGopDepStats> ComputeTplGopDepStats(
for (int coding_idx = 0; coding_idx < frame_count; coding_idx++) {
const StatusOr<TplFrameDepStats> tpl_frame_dep_stats =
CreateTplFrameDepStatsWithoutPropagation(
- tpl_gop_stats.frame_stats_list[coding_idx]);
+ *tpl_frame_stats_list_with_lookahead[coding_idx]);
if (!tpl_frame_dep_stats.ok()) {
return tpl_frame_dep_stats.status();
}
@@ -1213,50 +1269,208 @@ StatusOr<TplGopDepStats> ComputeTplGopDepStats(
return tpl_gop_dep_stats;
}
-static int GetRDMult(const GopFrame &gop_frame, int qindex) {
+static std::vector<uint8_t> SetupDeltaQ(const TplFrameDepStats &frame_dep_stats,
+ int frame_width, int frame_height,
+ int base_qindex,
+ double frame_importance) {
+ // TODO(jianj) : Add support to various superblock sizes.
+ const int sb_size = 64;
+ const int delta_q_res = 4;
+ const int num_unit_per_sb = sb_size / frame_dep_stats.unit_size;
+ const int sb_rows = (frame_height + sb_size - 1) / sb_size;
+ const int sb_cols = (frame_width + sb_size - 1) / sb_size;
+ const int unit_rows = (frame_height + frame_dep_stats.unit_size - 1) /
+ frame_dep_stats.unit_size;
+ const int unit_cols =
+ (frame_width + frame_dep_stats.unit_size - 1) / frame_dep_stats.unit_size;
+ std::vector<uint8_t> superblock_q_indices;
+ // Calculate delta_q offset for each superblock.
+ for (int sb_row = 0; sb_row < sb_rows; ++sb_row) {
+ for (int sb_col = 0; sb_col < sb_cols; ++sb_col) {
+ double intra_cost = 0;
+ double mc_dep_cost = 0;
+ const int unit_row_start = sb_row * num_unit_per_sb;
+ const int unit_row_end =
+ std::min((sb_row + 1) * num_unit_per_sb, unit_rows);
+ const int unit_col_start = sb_col * num_unit_per_sb;
+ const int unit_col_end =
+ std::min((sb_col + 1) * num_unit_per_sb, unit_cols);
+ // A simplified version of av1_get_q_for_deltaq_objective()
+ for (int unit_row = unit_row_start; unit_row < unit_row_end; ++unit_row) {
+ for (int unit_col = unit_col_start; unit_col < unit_col_end;
+ ++unit_col) {
+ const TplUnitDepStats &unit_dep_stat =
+ frame_dep_stats.unit_stats[unit_row][unit_col];
+ intra_cost += unit_dep_stat.intra_cost;
+ mc_dep_cost += unit_dep_stat.propagation_cost;
+ }
+ }
+
+ double beta = 1.0;
+ if (mc_dep_cost > 0 && intra_cost > 0) {
+ const double r0 = 1 / frame_importance;
+ const double rk = intra_cost / mc_dep_cost;
+ beta = r0 / rk;
+ assert(beta > 0.0);
+ }
+ int offset = av1_get_deltaq_offset(AOM_BITS_8, base_qindex, beta);
+ offset = std::min(offset, delta_q_res * 9 - 1);
+ offset = std::max(offset, -delta_q_res * 9 + 1);
+ int qindex = offset + base_qindex;
+ qindex = std::min(qindex, MAXQ);
+ qindex = std::max(qindex, MINQ);
+ qindex = av1_adjust_q_from_delta_q_res(delta_q_res, base_qindex, qindex);
+ superblock_q_indices.push_back(static_cast<uint8_t>(qindex));
+ }
+ }
+
+ return superblock_q_indices;
+}
+
+static std::unordered_map<int, double> FindKMeansClusterMap(
+ const std::vector<uint8_t> &qindices,
+ const std::vector<double> &centroids) {
+ std::unordered_map<int, double> cluster_map;
+ for (const uint8_t qindex : qindices) {
+ double nearest_centroid = *std::min_element(
+ centroids.begin(), centroids.end(),
+ [qindex](const double centroid_a, const double centroid_b) {
+ return fabs(centroid_a - qindex) < fabs(centroid_b - qindex);
+ });
+ cluster_map.insert({ qindex, nearest_centroid });
+ }
+ return cluster_map;
+}
+
+namespace internal {
+
+std::unordered_map<int, int> KMeans(std::vector<uint8_t> qindices, int k) {
+ std::vector<double> centroids;
+ // Initialize the centroids with first k qindices
+ std::unordered_set<int> qindices_set;
+
+ for (const uint8_t qp : qindices) {
+ if (!qindices_set.insert(qp).second) continue; // Already added.
+ centroids.push_back(qp);
+ if (static_cast<int>(centroids.size()) >= k) break;
+ }
+
+ std::unordered_map<int, double> intermediate_cluster_map;
+ while (true) {
+ // Find the closest centroid for each qindex
+ intermediate_cluster_map = FindKMeansClusterMap(qindices, centroids);
+ // For each cluster, calculate the new centroids
+ std::unordered_map<double, std::vector<int>> centroid_to_qindices;
+ for (const auto &qindex_centroid : intermediate_cluster_map) {
+ centroid_to_qindices[qindex_centroid.second].push_back(
+ qindex_centroid.first);
+ }
+ bool centroids_changed = false;
+ std::vector<double> new_centroids;
+ for (const auto &cluster : centroid_to_qindices) {
+ double sum = 0.0;
+ for (const int qindex : cluster.second) {
+ sum += qindex;
+ }
+ double new_centroid = sum / cluster.second.size();
+ new_centroids.push_back(new_centroid);
+ if (new_centroid != cluster.first) centroids_changed = true;
+ }
+ if (!centroids_changed) break;
+ centroids = new_centroids;
+ }
+ std::unordered_map<int, int> cluster_map;
+ for (const auto &qindex_centroid : intermediate_cluster_map) {
+ cluster_map.insert(
+ { qindex_centroid.first, static_cast<int>(qindex_centroid.second) });
+ }
+ return cluster_map;
+}
+} // namespace internal
+
+static int GetRDMult(const GopFrame &gop_frame, int q_index) {
// TODO(angiebird):
// 1) Check if these rdmult rules are good in our use case.
// 2) Support high-bit-depth mode
if (gop_frame.is_golden_frame) {
// Assume ARF_UPDATE/GF_UPDATE share the same rdmult rule.
- return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, GF_UPDATE, qindex);
+ return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, GF_UPDATE, q_index);
} else if (gop_frame.is_key_frame) {
- return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, KF_UPDATE, qindex);
+ return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, KF_UPDATE, q_index);
} else {
// Assume LF_UPDATE/OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE/INTNL_ARF_UPDATE
// share the same rdmult rule.
- return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, LF_UPDATE, qindex);
+ return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, LF_UPDATE, q_index);
}
}
-StatusOr<GopEncodeInfo> AV1RateControlQMode::GetGopEncodeInfo(
- const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
- const RefFrameTable &ref_frame_table_snapshot_init) {
- Status status = ValidateTplStats(gop_struct, tpl_gop_stats);
- if (!status.ok()) {
- return status;
+StatusOr<GopEncodeInfo> AV1RateControlQMode::GetGopEncodeInfoWithNoStats(
+ const GopStruct &gop_struct) {
+ GopEncodeInfo gop_encode_info;
+ const int frame_count = static_cast<int>(gop_struct.gop_frame_list.size());
+ for (int i = 0; i < frame_count; i++) {
+ FrameEncodeParameters param;
+ const GopFrame &gop_frame = gop_struct.gop_frame_list[i];
+ // Use constant QP for TPL pass encoding. Keep the functionality
+ // that allows QP changes across sub-gop.
+ param.q_index = rc_param_.base_q_index;
+ param.rdmult = av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, LF_UPDATE,
+ rc_param_.base_q_index);
+ // TODO(jingning): gop_frame is needed in two pass tpl later.
+ (void)gop_frame;
+
+ if (rc_param_.tpl_pass_index) {
+ if (gop_frame.update_type == GopFrameType::kRegularGolden ||
+ gop_frame.update_type == GopFrameType::kRegularKey ||
+ gop_frame.update_type == GopFrameType::kRegularArf) {
+ double qstep_ratio = 1 / 3.0;
+ param.q_index = av1_get_q_index_from_qstep_ratio(
+ rc_param_.base_q_index, qstep_ratio, AOM_BITS_8);
+ if (rc_param_.base_q_index) param.q_index = AOMMAX(param.q_index, 1);
+ }
+ }
+ gop_encode_info.param_list.push_back(param);
}
+ return gop_encode_info;
+}
- const std::vector<RefFrameTable> ref_frame_table_list =
- GetRefFrameTableList(gop_struct, ref_frame_table_snapshot_init);
+StatusOr<GopEncodeInfo> AV1RateControlQMode::GetGopEncodeInfoWithFp(
+ const GopStruct &gop_struct,
+ const FirstpassInfo &firstpass_info AOM_UNUSED) {
+ // TODO(b/260859962): This is currently a placeholder. Should use the fp
+ // stats to calculate frame-level qp.
+ return GetGopEncodeInfoWithNoStats(gop_struct);
+}
+
+StatusOr<GopEncodeInfo> AV1RateControlQMode::GetGopEncodeInfoWithTpl(
+ const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
+ const RefFrameTable &ref_frame_table_snapshot_init) {
+ const std::vector<RefFrameTable> ref_frame_table_list = GetRefFrameTableList(
+ gop_struct, lookahead_stats, ref_frame_table_snapshot_init);
GopEncodeInfo gop_encode_info;
gop_encode_info.final_snapshot = ref_frame_table_list.back();
- StatusOr<TplGopDepStats> gop_dep_stats =
- ComputeTplGopDepStats(tpl_gop_stats, ref_frame_table_list);
+ StatusOr<TplGopDepStats> gop_dep_stats = ComputeTplGopDepStats(
+ tpl_gop_stats, lookahead_stats, ref_frame_table_list);
if (!gop_dep_stats.ok()) {
return gop_dep_stats.status();
}
const int frame_count =
static_cast<int>(tpl_gop_stats.frame_stats_list.size());
+ const int active_worst_quality = rc_param_.base_q_index;
+ int active_best_quality = rc_param_.base_q_index;
for (int i = 0; i < frame_count; i++) {
FrameEncodeParameters param;
const GopFrame &gop_frame = gop_struct.gop_frame_list[i];
if (gop_frame.update_type == GopFrameType::kOverlay ||
- gop_frame.update_type == GopFrameType::kIntermediateOverlay) {
+ gop_frame.update_type == GopFrameType::kIntermediateOverlay ||
+ gop_frame.update_type == GopFrameType::kRegularLeaf) {
param.q_index = rc_param_.base_q_index;
- } else {
+ } else if (gop_frame.update_type == GopFrameType::kRegularGolden ||
+ gop_frame.update_type == GopFrameType::kRegularKey ||
+ gop_frame.update_type == GopFrameType::kRegularArf) {
const TplFrameDepStats &frame_dep_stats =
gop_dep_stats->frame_dep_stats_list[i];
const double cost_without_propagation =
@@ -1270,6 +1484,33 @@ StatusOr<GopEncodeInfo> AV1RateControlQMode::GetGopEncodeInfo(
param.q_index = av1_get_q_index_from_qstep_ratio(rc_param_.base_q_index,
qstep_ratio, AOM_BITS_8);
if (rc_param_.base_q_index) param.q_index = AOMMAX(param.q_index, 1);
+ active_best_quality = param.q_index;
+
+ if (rc_param_.max_distinct_q_indices_per_frame > 1) {
+ std::vector<uint8_t> superblock_q_indices = SetupDeltaQ(
+ frame_dep_stats, rc_param_.frame_width, rc_param_.frame_height,
+ param.q_index, frame_importance);
+ std::unordered_map<int, int> qindex_centroids = internal::KMeans(
+ superblock_q_indices, rc_param_.max_distinct_q_indices_per_frame);
+ for (size_t i = 0; i < superblock_q_indices.size(); ++i) {
+ const int curr_sb_qindex =
+ qindex_centroids.find(superblock_q_indices[i])->second;
+ const int delta_q_res = 4;
+ const int adjusted_qindex =
+ param.q_index +
+ (curr_sb_qindex - param.q_index) / delta_q_res * delta_q_res;
+ const int rd_mult = GetRDMult(gop_frame, adjusted_qindex);
+ param.superblock_encode_params.push_back(
+ { static_cast<uint8_t>(adjusted_qindex), rd_mult });
+ }
+ }
+ } else {
+ // Intermediate ARFs
+ assert(gop_frame.layer_depth >= 1);
+ const int depth_factor = 1 << (gop_frame.layer_depth - 1);
+ param.q_index =
+ (active_worst_quality * (depth_factor - 1) + active_best_quality) /
+ depth_factor;
}
param.rdmult = GetRDMult(gop_frame, param.q_index);
gop_encode_info.param_list.push_back(param);
@@ -1277,4 +1518,35 @@ StatusOr<GopEncodeInfo> AV1RateControlQMode::GetGopEncodeInfo(
return gop_encode_info;
}
+StatusOr<GopEncodeInfo> AV1RateControlQMode::GetTplPassGopEncodeInfo(
+ const GopStruct &gop_struct, const FirstpassInfo &firstpass_info) {
+ return GetGopEncodeInfoWithFp(gop_struct, firstpass_info);
+}
+
+StatusOr<GopEncodeInfo> AV1RateControlQMode::GetGopEncodeInfo(
+ const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
+ const FirstpassInfo &firstpass_info AOM_UNUSED,
+ const RefFrameTable &ref_frame_table_snapshot_init) {
+ // When TPL stats are not valid, use first pass stats.
+ Status status = ValidateTplStats(gop_struct, tpl_gop_stats);
+ if (!status.ok()) {
+ return status;
+ }
+
+ for (const auto &lookahead_stat : lookahead_stats) {
+ Status status = ValidateTplStats(*lookahead_stat.gop_struct,
+ *lookahead_stat.tpl_gop_stats);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ // TODO(b/260859962): Currently firstpass stats are used as an alternative,
+ // but we could also combine them with tpl results in the future for more
+ // stable qp determination.
+ return GetGopEncodeInfoWithTpl(gop_struct, tpl_gop_stats, lookahead_stats,
+ ref_frame_table_snapshot_init);
+}
+
} // namespace aom
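For frames that are neither overlays, regular leaves, nor key/golden/ARF frames (i.e. the intermediate ARFs), the new q_index rule above is a weighted average of the active worst quality (the base q_index) and the active best quality (the q_index chosen for the preceding key/golden/ARF frame), with the weight on the worst quality growing with layer depth. A scalar restatement with example numbers:

static int InterArfQIndexSketch(int active_worst, int active_best,
                                int layer_depth) {
  const int depth_factor = 1 << (layer_depth - 1);
  return (active_worst * (depth_factor - 1) + active_best) / depth_factor;
}
// e.g. InterArfQIndexSketch(128, 64, 3) == (128 * 3 + 64) / 4 == 112,
// i.e. deeper frames sit progressively closer to the base q_index.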
diff --git a/av1/ratectrl_qmode.h b/av1/qmode_rc/ratectrl_qmode.h
index 7a59687ca..f60000e12 100644
--- a/av1/ratectrl_qmode.h
+++ b/av1/qmode_rc/ratectrl_qmode.h
@@ -9,15 +9,16 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AV1_RATECTRL_QMODE_H_
-#define AOM_AV1_RATECTRL_QMODE_H_
+#ifndef AOM_AV1_QMODE_RC_RATECTRL_QMODE_H_
+#define AOM_AV1_QMODE_RC_RATECTRL_QMODE_H_
#include <deque>
#include <queue>
+#include <unordered_map>
#include <vector>
#include "av1/encoder/firstpass.h"
-#include "av1/ratectrl_qmode_interface.h"
-#include "av1/reference_manager.h"
+#include "av1/qmode_rc/ratectrl_qmode_interface.h"
+#include "av1/qmode_rc/reference_manager.h"
namespace aom {
@@ -35,6 +36,7 @@ struct TplUnitDepStats {
struct TplFrameDepStats {
int unit_size; // equivalent to min_block_size
+ double rdcost; // overall rate-distortion cost
std::vector<std::vector<TplUnitDepStats>> unit_stats;
};
@@ -77,6 +79,9 @@ std::vector<int> GetKeyFrameList(const FirstpassInfo &first_pass_info);
double TplFrameDepStatsAccumulateIntraCost(
const TplFrameDepStats &frame_dep_stats);
+double TplFrameDepStatsAccumulateInterCost(
+ const TplFrameDepStats &frame_dep_stats);
+
double TplFrameDepStatsAccumulate(const TplFrameDepStats &frame_dep_stats);
void TplFrameDepStatsPropagate(int coding_idx,
@@ -85,8 +90,13 @@ void TplFrameDepStatsPropagate(int coding_idx,
int GetBlockOverlapArea(int r0, int c0, int r1, int c1, int size);
+namespace internal {
+std::unordered_map<int, int> KMeans(std::vector<uint8_t> qindices, int k);
+}
+
StatusOr<TplGopDepStats> ComputeTplGopDepStats(
const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
const std::vector<RefFrameTable> &ref_frame_table_list);
class AV1RateControlQMode : public AV1RateControlQModeInterface {
@@ -96,7 +106,12 @@ class AV1RateControlQMode : public AV1RateControlQModeInterface {
const FirstpassInfo &firstpass_info) override;
StatusOr<GopEncodeInfo> GetGopEncodeInfo(
const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
+ const FirstpassInfo &firstpass_info,
const RefFrameTable &ref_frame_table_snapshot) override;
+ StatusOr<GopEncodeInfo> GetTplPassGopEncodeInfo(
+ const GopStruct &gop_struct,
+ const FirstpassInfo &firstpass_info) override;
// Public for testing only.
// Returns snapshots of the ref frame before and after each frame in
@@ -104,11 +119,23 @@ class AV1RateControlQMode : public AV1RateControlQModeInterface {
// If this is the first GOP, ref_frame_table is ignored and all refs are assumed
// invalid; otherwise ref_frame_table is used as the initial state.
std::vector<RefFrameTable> GetRefFrameTableList(
- const GopStruct &gop_struct, RefFrameTable ref_frame_table);
+ const GopStruct &gop_struct,
+ const std::vector<LookaheadStats> &lookahead_stats,
+ RefFrameTable ref_frame_table);
private:
RateControlParam rc_param_;
+
+ // Private methods to determine GOP encode info with different stats
+ StatusOr<GopEncodeInfo> GetGopEncodeInfoWithNoStats(
+ const GopStruct &gop_struct);
+ StatusOr<GopEncodeInfo> GetGopEncodeInfoWithFp(
+ const GopStruct &gop_struct, const FirstpassInfo &firstpass_info);
+ StatusOr<GopEncodeInfo> GetGopEncodeInfoWithTpl(
+ const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
+ const RefFrameTable &ref_frame_table_snapshot_init);
};
} // namespace aom
-#endif // AOM_AV1_RATECTRL_QMODE_H_
+#endif // AOM_AV1_QMODE_RC_RATECTRL_QMODE_H_
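internal::KMeans() above is exposed only for testing, and the patch does not show its body. As a rough illustration of the kind of clustering that signature implies (mapping each observed q_index to one of at most k representative values, e.g. to respect max_distinct_q_indices_per_frame), a plain Lloyd-style 1-D k-means could look like the sketch below. KMeans1D is a hypothetical name; the algorithm here is illustrative, not the patch's implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <unordered_map>
#include <vector>

// Illustrative 1-D k-means: maps each observed q_index to its cluster
// centroid.
std::unordered_map<int, int> KMeans1D(std::vector<uint8_t> qindices, int k) {
  std::unordered_map<int, int> assignment;
  if (qindices.empty() || k <= 0) return assignment;
  // Seed centroids evenly across the sorted samples.
  std::vector<uint8_t> sorted = qindices;
  std::sort(sorted.begin(), sorted.end());
  std::vector<double> centroids(k);
  for (int c = 0; c < k; ++c) {
    centroids[c] = sorted[(sorted.size() - 1) * c / std::max(k - 1, 1)];
  }
  auto nearest = [&](uint8_t q) {
    int best = 0;
    for (int c = 1; c < k; ++c) {
      if (std::fabs(q - centroids[c]) < std::fabs(q - centroids[best])) {
        best = c;
      }
    }
    return best;
  };
  for (int iter = 0; iter < 10; ++iter) {  // A few Lloyd iterations suffice.
    std::vector<double> sum(k, 0.0);
    std::vector<int> count(k, 0);
    for (uint8_t q : qindices) {
      const int c = nearest(q);
      sum[c] += q;
      ++count[c];
    }
    for (int c = 0; c < k; ++c) {
      if (count[c] > 0) centroids[c] = sum[c] / count[c];
    }
  }
  for (uint8_t q : qindices) {
    assignment[q] = static_cast<int>(centroids[nearest(q)] + 0.5);
  }
  return assignment;
}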
diff --git a/av1/ratectrl_qmode_interface.cc b/av1/qmode_rc/ratectrl_qmode_interface.cc
index eb29e4330..1f03e0c13 100644
--- a/av1/ratectrl_qmode_interface.cc
+++ b/av1/qmode_rc/ratectrl_qmode_interface.cc
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/ratectrl_qmode_interface.h"
+#include "av1/qmode_rc/ratectrl_qmode_interface.h"
namespace aom {
diff --git a/av1/ratectrl_qmode_interface.h b/av1/qmode_rc/ratectrl_qmode_interface.h
index 1e4e5c663..a7fff4a32 100644
--- a/av1/ratectrl_qmode_interface.h
+++ b/av1/qmode_rc/ratectrl_qmode_interface.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AV1_RATECTRL_QMODE_INTERFACE_H_
-#define AOM_AV1_RATECTRL_QMODE_INTERFACE_H_
+#ifndef AOM_AV1_QMODE_RC_RATECTRL_QMODE_INTERFACE_H_
+#define AOM_AV1_QMODE_RC_RATECTRL_QMODE_INTERFACE_H_
#include <array>
#include <string>
@@ -24,11 +24,18 @@ namespace aom {
constexpr int kBlockRefCount = 2;
struct MotionVector {
- int row; // subpel row
- int col; // subpel col
+ int row; // subpel row
+ int col; // subpel col
+ // TODO(b/241589513): Move this to TplFrameStats; it's wasteful to code it
+ // separately for each block.
int subpel_bits; // number of fractional bits used by row/col
};
+enum class TplPassCount {
+ kOneTplPass = 1,
+ kTwoTplPasses = 2,
+};
+
struct RateControlParam {
// Range of allowed GOP sizes (number of displayed frames).
int max_gop_show_frame_count;
@@ -37,23 +44,42 @@ struct RateControlParam {
int ref_frame_table_size;
// Maximum number of references a single frame may use.
int max_ref_frames;
- // Maximum pyramid depth. e.g., 1 means only one ARF per GOP,
- // 2 would allow an additional level of intermediate ARFs.
- int max_depth;
int base_q_index;
+ // If greater than 1, enables per-superblock q_index, and limits the number of
+ // unique q_index values which may be used in a frame (each of which will have
+ // its own unique rdmult value).
+ int max_distinct_q_indices_per_frame;
+
+ // If per-superblock q_index is enabled and this is greater than 1, enables
+ // additional per-superblock scaling of lambda, and limits the number of
+ // unique lambda scale values which may be used in a frame.
+ int max_distinct_lambda_scales_per_frame;
+
int frame_width;
int frame_height;
+
+ // Total number of TPL passes.
+ TplPassCount tpl_pass_count = TplPassCount::kOneTplPass;
+ // Current TPL pass number, 0 or 1 (for GetTplPassGopEncodeInfo).
+ int tpl_pass_index = 0;
};
struct TplBlockStats {
- int height; // pixel height
- int width; // pixel width
- int row; // pixel row of the top left corner
- int col; // pixel col of the top lef corner
- int64_t intra_cost;
- int64_t inter_cost;
+ int16_t height; // Pixel height.
+ int16_t width; // Pixel width.
+ int16_t row; // Pixel row of the top left corner.
+ int16_t col; // Pixel col of the top left corner.
+ int64_t intra_cost; // Rd cost of the best intra mode.
+ int64_t inter_cost; // Rd cost of the best inter mode.
+
+ // Valid only if TplFrameStats::rate_dist_present is true:
+ int64_t recrf_rate; // Bits when using recon as reference.
+ int64_t recrf_dist; // Distortion when using recon as reference.
+ int64_t intra_pred_err; // Prediction residual of the intra mode.
+ int64_t inter_pred_err; // Prediction residual of the inter mode.
+
std::array<MotionVector, kBlockRefCount> mv;
std::array<int, kBlockRefCount> ref_frame_index;
};
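A minimal sketch of configuring the fields added to RateControlParam, assuming a caller that wants per-superblock q_index and two TPL passes; the numeric values are illustrative, and fields not touched here keep their defaults.

RateControlParam rc_param = {};
rc_param.frame_width = 1280;
rc_param.frame_height = 720;
rc_param.max_gop_show_frame_count = 16;
rc_param.ref_frame_table_size = 8;
rc_param.max_ref_frames = 7;
rc_param.base_q_index = 117;
// Allow up to 4 distinct q_index values per frame (each with its own rdmult),
// plus up to 4 distinct per-superblock lambda scales on top of that.
rc_param.max_distinct_q_indices_per_frame = 4;
rc_param.max_distinct_lambda_scales_per_frame = 4;
// Request two TPL passes; tpl_pass_index would be advanced to 1 before the
// stats for the second GetTplPassGopEncodeInfo() pass are requested.
rc_param.tpl_pass_count = TplPassCount::kTwoTplPasses;
rc_param.tpl_pass_index = 0;
// Then: rate_control.SetRcParam(rc_param);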
@@ -208,9 +234,35 @@ struct GopStruct {
using GopStructList = std::vector<GopStruct>;
+struct SuperblockEncodeParameters {
+ int q_index;
+ int rdmult;
+};
+
struct FrameEncodeParameters {
+ // Base q_index for the frame.
int q_index;
+
+ // Frame level Lagrangian multiplier.
int rdmult;
+
+ // If max_distinct_q_indices_per_frame <= 1, this will be empty.
+ // Otherwise:
+ // - There must be one entry per 64x64 superblock, in row-major order
+ // - There may be no more than max_distinct_q_indices_per_frame unique q_index
+ // values
+ // - All entries with the same q_index must have the same rdmult
+ // (If it's desired to use different rdmult values with the same q_index, this
+ // must be done with superblock_lambda_scales.)
+ std::vector<SuperblockEncodeParameters> superblock_encode_params;
+
+ // If max_distinct_q_indices_per_frame <= 1 or
+ // max_distinct_lambda_scales_per_frame <= 1, this will be empty. Otherwise,
+ // it will have one entry per 64x64 superblock, in row-major order, with no
+ // more than max_distinct_lambda_scales_per_frame unique values. Each entry
+ // should be multiplied by the rdmult in the corresponding superblock's entry
+ // in superblock_encode_params.
+ std::vector<float> superblock_lambda_scales;
};
struct FirstpassInfo {
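The constraints documented for superblock_encode_params above are easy to get wrong, so here is a small, illustrative consistency check that a producer of FrameEncodeParameters could run. ValidateSuperblockParams and its parameters are hypothetical; this helper is not part of the patch.

#include <unordered_map>

bool ValidateSuperblockParams(const FrameEncodeParameters &params,
                              int frame_width, int frame_height,
                              int max_distinct_q_indices_per_frame) {
  if (params.superblock_encode_params.empty()) return true;
  const int sb_cols = (frame_width + 63) / 64;
  const int sb_rows = (frame_height + 63) / 64;
  // One entry per 64x64 superblock, in row-major order.
  if (static_cast<int>(params.superblock_encode_params.size()) !=
      sb_rows * sb_cols) {
    return false;
  }
  std::unordered_map<int, int> rdmult_for_q;  // q_index -> rdmult
  for (const SuperblockEncodeParameters &sb : params.superblock_encode_params) {
    const auto it = rdmult_for_q.find(sb.q_index);
    if (it == rdmult_for_q.end()) {
      rdmult_for_q[sb.q_index] = sb.rdmult;
    } else if (it->second != sb.rdmult) {
      return false;  // A given q_index must always use the same rdmult.
    }
  }
  // No more than the configured number of distinct q_index values.
  return static_cast<int>(rdmult_for_q.size()) <=
         max_distinct_q_indices_per_frame;
}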
@@ -233,13 +285,23 @@ struct TplFrameStats {
int min_block_size;
int frame_width;
int frame_height;
+ bool rate_dist_present; // True if recrf_rate and recrf_dist are populated.
std::vector<TplBlockStats> block_stats_list;
+ // Optional stats computed with different settings; should be empty unless
+ // tpl_pass_count == kTwoTplPasses.
+ std::vector<TplBlockStats> alternate_block_stats_list;
};
struct TplGopStats {
std::vector<TplFrameStats> frame_stats_list;
};
+// Structure and TPL stats for a single GOP, to be used for lookahead.
+struct LookaheadStats {
+ const GopStruct *gop_struct; // Not owned; must not be nullptr.
+ const TplGopStats *tpl_gop_stats; // Not owned; must not be nullptr.
+};
+
class AV1RateControlQModeInterface {
public:
AV1RateControlQModeInterface();
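Roughly how an encoder-side producer might fill the new TplFrameStats fields: alternate_block_stats_list is populated only when two TPL passes were requested, and rate_dist_present signals that recrf_rate/recrf_dist in the block stats are meaningful. MakeTplFrameStats and its arguments are hypothetical; the sketch is illustrative only.

TplFrameStats MakeTplFrameStats(const RateControlParam &rc_param,
                                std::vector<TplBlockStats> blocks,
                                std::vector<TplBlockStats> alternate_blocks) {
  TplFrameStats stats = {};
  stats.min_block_size = 16;
  stats.frame_width = rc_param.frame_width;
  stats.frame_height = rc_param.frame_height;
  stats.rate_dist_present = true;  // recrf_rate / recrf_dist are filled in.
  stats.block_stats_list = std::move(blocks);
  if (rc_param.tpl_pass_count == TplPassCount::kTwoTplPasses) {
    // Stats gathered with the alternate settings of the second TPL pass.
    stats.alternate_block_stats_list = std::move(alternate_blocks);
  }
  return stats;
}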
@@ -248,16 +310,49 @@ class AV1RateControlQModeInterface {
virtual Status SetRcParam(const RateControlParam &rc_param) = 0;
virtual StatusOr<GopStructList> DetermineGopInfo(
const FirstpassInfo &firstpass_info) = 0;
- // Accept firstpass and TPL info from the encoder and return q index and
- // rdmult. This needs to be called with consecutive GOPs as returned by
- // DetermineGopInfo.
+
+ // Accepts GOP structure and TPL info from the encoder and returns q index and
+ // rdmult for each frame. This should be called with consecutive GOPs as
+ // returned by DetermineGopInfo.
+ //
+ // GOP structure and TPL info from zero or more subsequent GOPs may optionally
+ // be passed in lookahead_stats.
+ //
// For the first GOP, a default-constructed RefFrameTable may be passed in as
// ref_frame_table_snapshot_init; for subsequent GOPs, it should be the
// final_snapshot returned on the previous call.
+ //
+ // TODO(b/260859962): Remove these once all callers and overrides are gone.
+ virtual StatusOr<GopEncodeInfo> GetGopEncodeInfo(
+ const GopStruct &gop_struct AOM_UNUSED,
+ const TplGopStats &tpl_gop_stats AOM_UNUSED,
+ const std::vector<LookaheadStats> &lookahead_stats AOM_UNUSED,
+ const RefFrameTable &ref_frame_table_snapshot AOM_UNUSED) {
+ return Status{ AOM_CODEC_UNSUP_FEATURE, "Deprecated" };
+ }
+ virtual StatusOr<GopEncodeInfo> GetTplPassGopEncodeInfo(
+ const GopStruct &gop_struct AOM_UNUSED) {
+ return Status{ AOM_CODEC_UNSUP_FEATURE, "Deprecated" };
+ }
+
+ // Extensions to the API to pass in the first pass info. There should be stats
+ // for all frames starting from the first frame of the GOP and continuing to
+ // the end of the sequence.
+ // TODO(b/260859962): Make pure virtual once all derived classes implement it.
virtual StatusOr<GopEncodeInfo> GetGopEncodeInfo(
- const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
- const RefFrameTable &ref_frame_table_snapshot_init) = 0;
+ const GopStruct &gop_struct AOM_UNUSED,
+ const TplGopStats &tpl_gop_stats AOM_UNUSED,
+ const std::vector<LookaheadStats> &lookahead_stats AOM_UNUSED,
+ const FirstpassInfo &firstpass_info AOM_UNUSED,
+ const RefFrameTable &ref_frame_table_snapshot AOM_UNUSED) {
+ return Status{ AOM_CODEC_UNSUP_FEATURE, "Not yet implemented" };
+ }
+ virtual StatusOr<GopEncodeInfo> GetTplPassGopEncodeInfo(
+ const GopStruct &gop_struct AOM_UNUSED,
+ const FirstpassInfo &firstpass_info AOM_UNUSED) {
+ return Status{ AOM_CODEC_UNSUP_FEATURE, "Not yet implemented" };
+ }
};
} // namespace aom
-#endif // AOM_AV1_RATECTRL_QMODE_INTERFACE_H_
+#endif // AOM_AV1_QMODE_RC_RATECTRL_QMODE_INTERFACE_H_
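With the deprecated overloads now carrying default "unsupported" bodies, a custom rate controller only has to override SetRcParam(), DetermineGopInfo() and whichever of the new overloads it supports. A minimal sketch of a trivial constant-q implementation follows; it assumes GopEncodeInfo exposes param_list/final_snapshot and GopStruct exposes gop_frame_list (member names not shown in this patch), and is not the library's own implementation.

class ConstantQRateControl : public AV1RateControlQModeInterface {
 public:
  Status SetRcParam(const RateControlParam &rc_param) override {
    rc_param_ = rc_param;
    return { AOM_CODEC_OK, "" };
  }
  StatusOr<GopStructList> DetermineGopInfo(
      const FirstpassInfo &firstpass_info) override {
    (void)firstpass_info;  // A real controller derives GOPs from this.
    return GopStructList();
  }
  StatusOr<GopEncodeInfo> GetGopEncodeInfo(
      const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
      const std::vector<LookaheadStats> &lookahead_stats,
      const FirstpassInfo &firstpass_info,
      const RefFrameTable &ref_frame_table_snapshot) override {
    (void)tpl_gop_stats;
    (void)lookahead_stats;
    (void)firstpass_info;
    GopEncodeInfo info;
    info.final_snapshot = ref_frame_table_snapshot;
    FrameEncodeParameters param = {};
    param.q_index = rc_param_.base_q_index;  // Same q for every frame.
    param.rdmult = 1;
    info.param_list.assign(gop_struct.gop_frame_list.size(), param);
    return info;
  }

 private:
  RateControlParam rc_param_;
};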
diff --git a/av1/reference_manager.cc b/av1/qmode_rc/reference_manager.cc
index 456cec9ad..eea7b7d63 100644
--- a/av1/reference_manager.cc
+++ b/av1/qmode_rc/reference_manager.cc
@@ -15,8 +15,8 @@
#include <tuple>
#include <vector>
-#include "av1/reference_manager.h"
-#include "av1/ratectrl_qmode.h"
+#include "av1/qmode_rc/reference_manager.h"
+#include "av1/qmode_rc/ratectrl_qmode.h"
namespace aom {
@@ -80,11 +80,18 @@ int RefFrameManager::GetRefFrameIdxByPriority(RefUpdateType ref_update_type,
int priority_idx) const {
if (ref_update_type == RefUpdateType::kForward) {
int size = static_cast<int>(forward_stack_.size());
+ // When two or more forward reference frames can be used, first pick
+ // the highest quality one as the ARF, then go from the nearest to
+ // the more distant ones in the forward reference frame list.
if (priority_idx < size) {
- if (priority_idx == 0)
- return forward_stack_[priority_idx];
- else
+ if (allow_two_fwd_frames_) {
+ if (priority_idx == 0) return forward_stack_[0];
return forward_stack_[size - priority_idx];
+ }
+
+ // Handle the special case where only one forward reference frame
+ // can be used. In this setting, we prefer the nearest frame.
+ return forward_stack_[size - 1 - priority_idx];
}
} else if (ref_update_type == RefUpdateType::kBackward) {
int size = static_cast<int>(backward_queue_.size());
@@ -126,8 +133,8 @@ ReferenceName get_ref_name(RefUpdateType ref_update_type, int priority_idx,
const std::vector<ReferenceName> forward_name_list{
ReferenceName::kAltrefFrame, ReferenceName::kBwdrefFrame,
ReferenceName::kAltref2Frame, ReferenceName::kGoldenFrame,
- ReferenceName::kLastFrame, ReferenceName::kLast2Frame,
- ReferenceName::kLast3Frame
+ ReferenceName::kLast3Frame, ReferenceName::kLast2Frame,
+ ReferenceName::kLastFrame
};
const std::vector<ReferenceName> backward_name_list{
ReferenceName::kGoldenFrame, ReferenceName::kLastFrame,
@@ -178,8 +185,10 @@ std::vector<ReferenceFrame> RefFrameManager::GetRefFrameListByPriority() const {
std::vector<ReferenceFrame> ref_frame_list;
int ref_frame_count = 0;
int round_robin_idx = 0;
+
std::set<ReferenceName> used_name_set;
- while (ref_frame_count < available_ref_frames) {
+ while (ref_frame_count < available_ref_frames &&
+ ref_frame_count < max_ref_frames_) {
const RefUpdateType ref_update_type = round_robin_list[round_robin_idx];
int priority_idx = priority_idx_list[round_robin_idx];
int ref_idx = GetRefFrameIdxByPriority(ref_update_type, priority_idx);
@@ -270,13 +279,14 @@ ReferenceFrame RefFrameManager::GetPrimaryRefFrame(
const GopFrame &gop_frame) const {
assert(gop_frame.is_valid);
std::vector<std::pair<PrimaryRefKey, int>> candidate_list;
- for (int ref_idx = 0; ref_idx < static_cast<int>(ref_frame_table_.size());
- ++ref_idx) {
- const GopFrame &ref_frame = ref_frame_table_[ref_idx];
+ for (auto &ref_frame_in_gop_frame : gop_frame.ref_frame_list) {
+ const GopFrame &ref_frame = ref_frame_table_[ref_frame_in_gop_frame.index];
if (ref_frame.is_valid) {
- assert(ref_idx == ref_frame.update_ref_idx);
+ assert(ref_frame_in_gop_frame.index == ref_frame.update_ref_idx);
PrimaryRefKey key = get_primary_ref_key(gop_frame, ref_frame);
- std::pair<PrimaryRefKey, int> candidate = { key, ref_idx };
+ std::pair<PrimaryRefKey, int> candidate = {
+ key, ref_frame_in_gop_frame.index
+ };
candidate_list.push_back(candidate);
}
}
@@ -284,11 +294,10 @@ ReferenceFrame RefFrameManager::GetPrimaryRefFrame(
std::sort(candidate_list.begin(), candidate_list.end());
ReferenceFrame ref_frame = { -1, ReferenceName::kNoneFrame };
- std::vector<ReferenceFrame> ref_frame_list = GetRefFrameListByPriority();
- assert(candidate_list.size() == ref_frame_list.size());
+ assert(candidate_list.size() == gop_frame.ref_frame_list.size());
if (!candidate_list.empty()) {
int ref_idx = candidate_list[0].second;
- for (const auto &frame : ref_frame_list) {
+ for (const auto &frame : gop_frame.ref_frame_list) {
if (frame.index == ref_idx) {
ref_frame = frame;
}
@@ -298,6 +307,9 @@ ReferenceFrame RefFrameManager::GetPrimaryRefFrame(
}
void RefFrameManager::UpdateRefFrameTable(GopFrame *gop_frame) {
+ allow_two_fwd_frames_ =
+ (max_ref_frames_ - !!GetRefFrameCountByType(RefUpdateType::kBackward) -
+ !!GetRefFrameCountByType(RefUpdateType::kLast)) >= 2;
gop_frame->ref_frame_list = GetRefFrameListByPriority();
gop_frame->primary_ref_frame = GetPrimaryRefFrame(*gop_frame);
gop_frame->colocated_ref_idx = ColocatedRefIdx(gop_frame->global_order_idx);
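Two related behaviours are worth spelling out. UpdateRefFrameTable() now allows two forward references only when, after reserving one slot for a backward reference and one for a last reference (the !! terms), at least two of the max_ref_frames_ budget remain. And GetRefFrameIdxByPriority() orders forward references as follows; as a worked example, suppose forward_stack_ currently holds three entries A, B, C, where index 0 (A) is the highest-quality ARF candidate and the last entry (C) is the nearest forward frame, per the comments above:

  allow_two_fwd_frames_ == true:
    priority 0 -> forward_stack_[0]          (A, the ARF)
    priority 1 -> forward_stack_[3 - 1]      (C, the nearest forward frame)
    priority 2 -> forward_stack_[3 - 2]      (B)

  allow_two_fwd_frames_ == false:
    priority 0 -> forward_stack_[3 - 1 - 0]  (C, the nearest forward frame)
    priority 1 -> forward_stack_[3 - 1 - 1]  (B)
    priority 2 -> forward_stack_[3 - 1 - 2]  (A)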
diff --git a/av1/reference_manager.h b/av1/qmode_rc/reference_manager.h
index 59bfda38e..37b50381d 100644
--- a/av1/reference_manager.h
+++ b/av1/qmode_rc/reference_manager.h
@@ -9,14 +9,14 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AV1_REFERENCE_MANAGER_H_
-#define AOM_AV1_REFERENCE_MANAGER_H_
+#ifndef AOM_AV1_QMODE_RC_REFERENCE_MANAGER_H_
+#define AOM_AV1_QMODE_RC_REFERENCE_MANAGER_H_
#include <deque>
#include <iostream>
#include <vector>
-#include "av1/ratectrl_qmode_interface.h"
+#include "av1/qmode_rc/ratectrl_qmode_interface.h"
namespace aom {
@@ -24,8 +24,9 @@ enum class RefUpdateType { kForward, kBackward, kLast, kNone };
class RefFrameManager {
public:
- explicit RefFrameManager(int ref_frame_table_size)
- : ref_frame_table_(ref_frame_table_size) {
+ explicit RefFrameManager(int ref_frame_table_size, int max_ref_frames)
+ : ref_frame_table_(ref_frame_table_size),
+ max_ref_frames_(max_ref_frames) {
// forward_max_size_ defines the max number of arf frames that can exist at
// the same time. In other words, it's the max size of forward_stack_.
// TODO(angiebird): Figure out if this number is optimal.
@@ -72,6 +73,7 @@ class RefFrameManager {
void UpdateOrder(int global_order_idx);
int ColocatedRefIdx(int global_order_idx);
int ForwardMaxSize() const { return forward_max_size_; }
+ int MaxRefFrame() const { return max_ref_frames_; }
int CurGlobalOrderIdx() const { return cur_global_order_idx_; }
void UpdateRefFrameTable(GopFrame *gop_frame);
ReferenceFrame GetPrimaryRefFrame(const GopFrame &gop_frame) const;
@@ -80,6 +82,8 @@ class RefFrameManager {
int forward_max_size_;
int cur_global_order_idx_;
RefFrameTable ref_frame_table_;
+ int max_ref_frames_;
+ bool allow_two_fwd_frames_;
std::deque<int> free_ref_idx_list_;
std::vector<int> forward_stack_;
std::deque<int> backward_queue_;
@@ -88,4 +92,4 @@ class RefFrameManager {
} // namespace aom
-#endif // AOM_AV1_REFERENCE_MANAGER_H_
+#endif // AOM_AV1_QMODE_RC_REFERENCE_MANAGER_H_
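Construction-side sketch: the manager is now told both how many slots the reference frame table has and how many references a single frame may use, which in the rate controller would typically come straight from RateControlParam (illustrative only).

RefFrameManager ref_frame_manager(rc_param_.ref_frame_table_size,
                                  rc_param_.max_ref_frames);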
diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc
index f1af79735..6cf53f08c 100644
--- a/av1/ratectrl_rtc.cc
+++ b/av1/ratectrl_rtc.cc
@@ -65,8 +65,11 @@ std::unique_ptr<AV1RateControlRTC> AV1RateControlRTC::Create(
rc_api->cpi_->ppi =
static_cast<AV1_PRIMARY *>(aom_memalign(32, sizeof(AV1_PRIMARY)));
if (!rc_api->cpi_->ppi) return nullptr;
+ av1_zero(*rc_api->cpi_->ppi);
rc_api->cpi_->common.seq_params = &rc_api->cpi_->ppi->seq_params;
av1_zero(*rc_api->cpi_->common.seq_params);
+ const int num_layers = cfg.ss_number_layers * cfg.ts_number_layers;
+ if (!av1_alloc_layer_context(rc_api->cpi_, num_layers)) return nullptr;
rc_api->InitRateControl(cfg);
if (cfg.aq_mode) {
AV1_COMP *const cpi = rc_api->cpi_;
@@ -94,6 +97,9 @@ AV1RateControlRTC::~AV1RateControlRTC() {
}
}
}
+ aom_free(cpi_->svc.layer_context);
+ cpi_->svc.layer_context = nullptr;
+
if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
aom_free(cpi_->enc_seg.map);
cpi_->enc_seg.map = nullptr;
@@ -232,7 +238,6 @@ void AV1RateControlRTC::ComputeQP(const AV1FrameParamsRTC &frame_params) {
gf_group->update_type[cpi_->gf_frame_index] = KF_UPDATE;
gf_group->frame_type[cpi_->gf_frame_index] = KEY_FRAME;
gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_RESET;
- cpi_->rc.frames_since_key = 0;
if (cpi_->ppi->use_svc) {
const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
cpi_->svc.temporal_layer_id,
@@ -251,8 +256,9 @@ void AV1RateControlRTC::ComputeQP(const AV1FrameParamsRTC &frame_params) {
cpi_->svc.number_temporal_layers);
cpi_->svc.layer_context[layer].is_key_frame = 0;
}
- cpi_->rc.frames_since_key++;
}
+ if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1)
+ cpi_->rc.frames_since_key++;
if (cpi_->svc.number_spatial_layers > 1 ||
cpi_->svc.number_temporal_layers > 1) {
av1_update_temporal_layer_framerate(cpi_);
@@ -262,11 +268,13 @@ void AV1RateControlRTC::ComputeQP(const AV1FrameParamsRTC &frame_params) {
if (cpi_->oxcf.rc_cfg.mode == AOM_CBR) {
if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
av1_cyclic_refresh_update_parameters(cpi_);
- if (frame_is_intra_only(cm))
+ if (frame_is_intra_only(cm)) {
target = av1_calc_iframe_target_size_one_pass_cbr(cpi_);
- else
+ cpi_->common.current_frame.frame_number = 0;
+ } else {
target = av1_calc_pframe_target_size_one_pass_cbr(
cpi_, gf_group->update_type[cpi_->gf_frame_index]);
+ }
}
av1_rc_set_frame_target(cpi_, target, cm->width, cm->height);
@@ -292,11 +300,11 @@ int *AV1RateControlRTC::GetDeltaQ() const {
}
void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+ cpi_->common.current_frame.frame_number++;
av1_rc_postencode_update(cpi_, encoded_frame_size);
if (cpi_->svc.number_spatial_layers > 1 ||
cpi_->svc.number_temporal_layers > 1)
av1_save_layer_context(cpi_);
- cpi_->common.current_frame.frame_number++;
}
} // namespace aom
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 84159fba2..eafd26cf5 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -24,19 +24,13 @@ set_aom_detect_var(INLINE "" "Sets INLINE value for current target.")
# CPUs.
set_aom_detect_var(ARCH_ARM 0 "Enables ARM architecture.")
-set_aom_detect_var(ARCH_MIPS 0 "Enables MIPS architecture.")
set_aom_detect_var(ARCH_PPC 0 "Enables PPC architecture.")
set_aom_detect_var(ARCH_X86 0 "Enables X86 architecture.")
set_aom_detect_var(ARCH_X86_64 0 "Enables X86_64 architecture.")
# ARM feature flags.
set_aom_detect_var(HAVE_NEON 0 "Enables NEON intrinsics optimizations.")
-
-# MIPS feature flags.
-set_aom_detect_var(HAVE_DSPR2 0 "Enables DSPR2 optimizations.")
-set_aom_detect_var(HAVE_MIPS32 0 "Enables MIPS32 optimizations.")
-set_aom_detect_var(HAVE_MIPS64 0 "Enables MIPS64 optimizations. ")
-set_aom_detect_var(HAVE_MSA 0 "Enables MSA optimizations.")
+set_aom_detect_var(HAVE_ARM_CRC32 0 "Enables Arm CRC32 optimizations.")
# PPC feature flags.
set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.")
@@ -190,11 +184,6 @@ set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time."
# ARM assembly/intrinsics flags.
set_aom_option_var(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON)
-# MIPS assembly/intrinsics flags.
-set_aom_option_var(ENABLE_DSPR2 "Enables DSPR2 optimizations on MIPS targets."
- OFF)
-set_aom_option_var(ENABLE_MSA "Enables MSA optimizations on MIPS targets." OFF)
-
# VSX intrinsics flags.
set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets."
ON)
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index 26c521ede..ee566af65 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -40,6 +40,10 @@ if(FORCE_HIGHBITDEPTH_DECODING AND NOT CONFIG_AV1_HIGHBITDEPTH)
"FORCE_HIGHBITDEPTH_DECODING")
endif()
+if(CONFIG_THREE_PASS AND NOT CONFIG_AV1_DECODER)
+ change_config_and_warn(CONFIG_THREE_PASS 0 "CONFIG_AV1_DECODER=0")
+endif()
+
# Generate the user config settings.
list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS})
foreach(cache_var ${aom_build_vars})
@@ -67,7 +71,7 @@ if(NOT AOM_TARGET_CPU)
endif()
elseif(cpu_lowercase STREQUAL "i386" OR cpu_lowercase STREQUAL "x86")
set(AOM_TARGET_CPU "x86")
- elseif(cpu_lowercase MATCHES "^arm" OR cpu_lowercase MATCHES "^mips")
+ elseif(cpu_lowercase MATCHES "^arm")
set(AOM_TARGET_CPU "${cpu_lowercase}")
elseif(cpu_lowercase MATCHES "aarch64")
set(AOM_TARGET_CPU "arm64")
@@ -300,7 +304,17 @@ else()
add_compiler_flag_if_supported("-Wall")
add_compiler_flag_if_supported("-Wdisabled-optimization")
add_compiler_flag_if_supported("-Wextra")
- add_compiler_flag_if_supported("-Wextra-semi")
+ # Prior to version 3.19.0 cmake would fail to parse the warning emitted by gcc
+ # with this flag. Note the order of this check and -Wextra-semi-stmt is
+ # important due to is_flag_present() matching substrings with string(FIND
+ # ...).
+ if(CMAKE_VERSION VERSION_LESS "3.19"
+ AND CMAKE_C_COMPILER_ID STREQUAL "GNU"
+ AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 10)
+ add_cxx_flag_if_supported("-Wextra-semi")
+ else()
+ add_compiler_flag_if_supported("-Wextra-semi")
+ endif()
add_compiler_flag_if_supported("-Wextra-semi-stmt")
add_compiler_flag_if_supported("-Wfloat-conversion")
add_compiler_flag_if_supported("-Wformat=2")
@@ -314,6 +328,7 @@ else()
add_compiler_flag_if_supported("-Wuninitialized")
add_compiler_flag_if_supported("-Wunused")
add_compiler_flag_if_supported("-Wvla")
+ add_cxx_flag_if_supported("-Wc++14-extensions")
add_cxx_flag_if_supported("-Wc++17-extensions")
add_cxx_flag_if_supported("-Wc++20-extensions")
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake
index ef2d7552b..99ac38ab5 100644
--- a/build/cmake/cpu.cmake
+++ b/build/cmake/cpu.cmake
@@ -20,33 +20,19 @@ if("${AOM_TARGET_CPU}" MATCHES "^arm")
set(HAVE_NEON 0)
set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
endif()
-elseif("${AOM_TARGET_CPU}" MATCHES "^mips")
- set(ARCH_MIPS 1)
- set(RTCD_ARCH_MIPS "yes")
- if("${AOM_TARGET_CPU}" STREQUAL "mips32")
- set(HAVE_MIPS32 1)
- set(RTCD_HAVE_MIPS32 "yes")
- elseif("${AOM_TARGET_CPU}" STREQUAL "mips64")
- set(HAVE_MIPS64 1)
- set(RTCD_HAVE_MIPS64 "yes")
- endif()
-
- # HAVE_DSPR2 is set by mips toolchain files.
- if(ENABLE_DSPR2 AND HAVE_DSPR2)
- set(RTCD_HAVE_DSPR2 "yes")
+ check_c_source_compiles("
+ #if !defined(__ARM_FEATURE_CRC32) || __ARM_FEATURE_CRC32 != 1
+ #error \"CRC32 is unavailable.\"
+ #endif
+ int main(void) { return 0; }" HAVE_CRC32)
+ if(HAVE_CRC32)
+ set(HAVE_ARM_CRC32 1)
else()
- set(HAVE_DSPR2 0)
- set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-dspr2)
+ set(HAVE_ARM_CRC32 0)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-arm_crc32)
endif()
- # HAVE_MSA is set by mips toolchain files.
- if(ENABLE_MSA AND HAVE_MSA)
- set(RTCD_HAVE_MSA "yes")
- else()
- set(HAVE_MSA 0)
- set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-msa)
- endif()
elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
set(ARCH_PPC 1)
set(RTCD_ARCH_PPC "yes")
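The compile check above only determines whether the toolchain exposes the Arm CRC32 extension; HAVE_ARM_CRC32 then gates any code that wants to use the ACLE CRC32 intrinsics. A hedged sketch of the kind of code this flag is meant to guard is shown below; it is not from the patch, but <arm_acle.h> and __crc32w are the standard ACLE names.

#if HAVE_ARM_CRC32
#include <arm_acle.h>
#include <stdint.h>

// Fold a run of 32-bit words into a CRC32 value using the hardware instruction.
static uint32_t crc32_words(uint32_t crc, const uint32_t *data, int count) {
  for (int i = 0; i < count; ++i) crc = __crc32w(crc, data[i]);
  return crc;
}
#endif  // HAVE_ARM_CRC32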
diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl
index e9f75dd44..bd3b9d534 100755
--- a/build/cmake/rtcd.pl
+++ b/build/cmake/rtcd.pl
@@ -321,38 +321,6 @@ EOF
common_bottom;
}
-sub mips() {
- determine_indirection("c", @ALL_ARCHS);
-
- # Assign the helper variable for each enabled extension
- foreach my $opt (@ALL_ARCHS) {
- my $opt_uc = uc $opt;
- eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
- }
-
- common_top;
-
- print <<EOF;
-#include "config/aom_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-EOF
-
- set_function_pointers("c", @ALL_ARCHS);
-
- print <<EOF;
-#if HAVE_DSPR2
-void aom_dsputil_static_init();
-aom_dsputil_static_init();
-#endif
-}
-#endif
-EOF
- common_bottom;
-}
-
sub ppc() {
determine_indirection("c", @ALL_ARCHS);
@@ -420,20 +388,12 @@ if ($opts{arch} eq 'x86') {
@REQUIRES = filter(qw/mmx sse sse2/);
&require(@REQUIRES);
x86;
-} elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
- @ALL_ARCHS = filter("$opts{arch}");
- if (aom_config("HAVE_DSPR2") eq "yes") {
- @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
- } elsif (aom_config("HAVE_MSA") eq "yes") {
- @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
- }
- mips;
} elsif ($opts{arch} =~ /armv[78]\w?/) {
@ALL_ARCHS = filter(qw/neon/);
arm;
} elsif ($opts{arch} eq 'arm64' ) {
- @ALL_ARCHS = filter(qw/neon/);
- &require("neon");
+ @ALL_ARCHS = filter(qw/neon arm_crc32/);
+ &require(@ALL_ARCHS);
arm;
} elsif ($opts{arch} eq 'ppc') {
@ALL_ARCHS = filter(qw/vsx/);
diff --git a/build/cmake/toolchains/arm64-linux-gcc.cmake b/build/cmake/toolchains/arm64-linux-gcc.cmake
index fc4b277bb..64e460b60 100644
--- a/build/cmake/toolchains/arm64-linux-gcc.cmake
+++ b/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -21,9 +21,15 @@ if("${CROSS}" STREQUAL "")
set(CROSS aarch64-linux-gnu-)
endif()
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(AS_EXECUTABLE ${CROSS}as)
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT AS_EXECUTABLE)
+ set(AS_EXECUTABLE ${CROSS}as)
+endif()
set(CMAKE_C_FLAGS_INIT "-march=armv8-a")
set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
set(AOM_AS_FLAGS "-march=armv8-a")
diff --git a/build/cmake/toolchains/arm64-macos.cmake b/build/cmake/toolchains/arm64-macos.cmake
new file mode 100644
index 000000000..99f8d16e1
--- /dev/null
+++ b/build/cmake/toolchains/arm64-macos.cmake
@@ -0,0 +1,16 @@
+#
+# Copyright (c) 2022, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_ARCHITECTURES "arm64")
+set(CMAKE_C_FLAGS_INIT "-arch arm64")
+set(CMAKE_CXX_FLAGS_INIT "-arch arm64")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch arm64")
diff --git a/build/cmake/toolchains/arm64-mingw-gcc.cmake b/build/cmake/toolchains/arm64-mingw-gcc.cmake
index a8e15cb31..5472ed4d4 100644
--- a/build/cmake/toolchains/arm64-mingw-gcc.cmake
+++ b/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -20,10 +20,18 @@ if("${CROSS}" STREQUAL "")
set(CROSS aarch64-w64-mingw32-)
endif()
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
-set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
# No runtime cpu detect for arm64-mingw-gcc.
set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/armv7-linux-gcc.cmake b/build/cmake/toolchains/armv7-linux-gcc.cmake
index 26c028f11..1201538a2 100644
--- a/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -25,9 +25,15 @@ if(NOT ${CROSS} MATCHES hf-$)
set(AOM_EXTRA_TOOLCHAIN_FLAGS "-mfloat-abi=softfp")
endif()
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(AS_EXECUTABLE ${CROSS}as)
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT AS_EXECUTABLE)
+ set(AS_EXECUTABLE ${CROSS}as)
+endif()
set(CMAKE_C_FLAGS_INIT "-march=armv7-a -mfpu=vfpv3 \
${AOM_EXTRA_TOOLCHAIN_FLAGS}")
set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -mfpu=vfpv3 \
diff --git a/build/cmake/toolchains/armv7-mingw-gcc.cmake b/build/cmake/toolchains/armv7-mingw-gcc.cmake
index 2dc4b1882..8a928916d 100644
--- a/build/cmake/toolchains/armv7-mingw-gcc.cmake
+++ b/build/cmake/toolchains/armv7-mingw-gcc.cmake
@@ -20,10 +20,18 @@ if("${CROSS}" STREQUAL "")
set(CROSS armv7-w64-mingw32-)
endif()
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
-set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
# No runtime cpu detect for armv7-mingw-gcc.
set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/mips32-linux-gcc.cmake b/build/cmake/toolchains/mips32-linux-gcc.cmake
deleted file mode 100644
index ad5ebffdc..000000000
--- a/build/cmake/toolchains/mips32-linux-gcc.cmake
+++ /dev/null
@@ -1,78 +0,0 @@
-#
-# Copyright (c) 2017, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and the
-# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
-# not distributed with this source code in the LICENSE file, you can obtain it
-# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
-# License 1.0 was not distributed with this source code in the PATENTS file, you
-# can obtain it at www.aomedia.org/license/patent.
-#
-if(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_)
- return()
-endif() # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_
-set(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_ 1)
-
-set(CMAKE_SYSTEM_NAME "Linux")
-
-if(ENABLE_DSPR2 AND ENABLE_MSA)
- message(FATAL_ERROR "ENABLE_DSPR2 and ENABLE_MSA cannot be combined.")
-endif()
-
-if(ENABLE_DSPR2)
- set(HAVE_DSPR2 1 CACHE BOOL "" FORCE)
-
- if("${CROSS}" STREQUAL "")
-
- # Default the cross compiler prefix to something known to work.
- set(CROSS mips-linux-gnu-)
- endif()
-
- set(MIPS_CFLAGS "-mdspr2")
- set(MIPS_CXXFLAGS "-mdspr2")
-elseif(ENABLE_MSA)
- set(HAVE_MSA 1 CACHE BOOL "" FORCE)
-
- if("${CROSS}" STREQUAL "")
-
- # Default the cross compiler prefix to something known to work.
- set(CROSS mips-mti-linux-gnu-)
- endif()
-
- set(MIPS_CFLAGS "-mmsa")
- set(MIPS_CXXFLAGS "-mmsa")
-endif()
-
-if("${CROSS}" STREQUAL "")
-
- # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix won't
- # be desired on a mips host. Default cross compiler prefix to something that
- # might work for an unoptimized build.
- set(CROSS mips-linux-gnu-)
-endif()
-
-if("${MIPS_CPU}" STREQUAL "")
- set(MIPS_CFLAGS "${MIPS_CFLAGS} -mips32r2")
- set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} -mips32r2")
-elseif("${MIPS_CPU}" STREQUAL "p5600")
- set(P56_FLAGS
- "-mips32r5 -mload-store-pairs -msched-weight -mhard-float -mfp64")
- set(MIPS_CFLAGS "${MIPS_CFLAGS} ${P56_FLAGS}")
- set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} ${P56_FLAGS}")
- set(CMAKE_EXE_LINKER_FLAGS "-mfp64 ${CMAKE_EXE_LINKER_FLAGS}")
-endif()
-
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_FLAGS_INIT "-EL ${MIPS_CFLAGS}")
-set(CMAKE_CXX_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
-set(CMAKE_EXE_LINKER_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
-set(CMAKE_SYSTEM_PROCESSOR "mips32")
-
-# No runtime cpu detect for mips32-linux-gcc.
-if(CONFIG_RUNTIME_CPU_DETECT)
- message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips32 targets.")
-endif()
-
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "" FORCE)
diff --git a/build/cmake/toolchains/mips64-linux-gcc.cmake b/build/cmake/toolchains/mips64-linux-gcc.cmake
deleted file mode 100644
index 0af992451..000000000
--- a/build/cmake/toolchains/mips64-linux-gcc.cmake
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# Copyright (c) 2017, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and the
-# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
-# not distributed with this source code in the LICENSE file, you can obtain it
-# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
-# License 1.0 was not distributed with this source code in the PATENTS file, you
-# can obtain it at www.aomedia.org/license/patent.
-#
-if(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_)
- return()
-endif() # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_
-set(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_ 1)
-
-set(CMAKE_SYSTEM_NAME "Linux")
-
-if("${CROSS}" STREQUAL "")
-
- # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix won't
- # be desired on a mips host.
- #
- # Default the cross compiler prefix to something known to work.
- set(CROSS mips-img-linux-gnu-)
-endif()
-
-if(ENABLE_MSA)
- set(HAVE_MSA 1 CACHE BOOL "" FORCE)
- set(MIPS_CFLAGS "-mmsa")
- set(MIPS_CXXFLAGS "-mmsa")
-endif()
-
-if("${MIPS_CPU}" STREQUAL "i6400" OR "${MIPS_CPU}" STREQUAL "p6600")
- set(MIPS_CPU_FLAGS "-mips64r6 -mabi=64 -mload-store-pairs -msched-weight")
- set(MIPS_CPU_FLAGS "${MIPS_CPU_FLAGS} -mhard-float -mfp64")
- set(MIPS_CFLAGS "${MIPS_CFLAGS} ${MIPS_CPU_FLAGS}")
- set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} ${MIPS_CPU_FLAGS}")
- set(CMAKE_EXE_LINKER_FLAGS
- "-mips64r6 -mabi64 -mfp64 ${CMAKE_EXE_LINKER_FLAGS}")
-endif()
-
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_FLAGS_INIT "-EL ${MIPS_CFLAGS}")
-set(CMAKE_CXX_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
-set(CMAKE_EXE_LINKER_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
-set(CMAKE_SYSTEM_PROCESSOR "mips64")
-
-# No runtime cpu detect for mips64-linux-gcc.
-if(CONFIG_RUNTIME_CPU_DETECT)
- message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips64 targets.")
-endif()
-
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "" FORCE)
diff --git a/build/cmake/toolchains/ppc-linux-gcc.cmake b/build/cmake/toolchains/ppc-linux-gcc.cmake
index 54db99bb4..ab0efeab0 100644
--- a/build/cmake/toolchains/ppc-linux-gcc.cmake
+++ b/build/cmake/toolchains/ppc-linux-gcc.cmake
@@ -21,9 +21,15 @@ if("${CROSS}" STREQUAL "")
set(CROSS powerpc64le-unknown-linux-gnu-)
endif()
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(AS_EXECUTABLE ${CROSS}as)
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT AS_EXECUTABLE)
+ set(AS_EXECUTABLE ${CROSS}as)
+endif()
set(CMAKE_SYSTEM_PROCESSOR "ppc")
set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/riscv-linux-gcc.cmake b/build/cmake/toolchains/riscv-linux-gcc.cmake
new file mode 100644
index 000000000..21e73709d
--- /dev/null
+++ b/build/cmake/toolchains/riscv-linux-gcc.cmake
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2022, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_RISCV_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_RISCV_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_RISCV_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to something known to work.
+ set(CROSS riscv64-unknown-linux-gnu-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT AS_EXECUTABLE)
+ set(AS_EXECUTABLE ${CROSS}as)
+endif()
+set(CMAKE_SYSTEM_PROCESSOR "riscv")
+
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/x86-mingw-gcc.cmake b/build/cmake/toolchains/x86-mingw-gcc.cmake
index 2e9a9a84b..f75728f2c 100644
--- a/build/cmake/toolchains/x86-mingw-gcc.cmake
+++ b/build/cmake/toolchains/x86-mingw-gcc.cmake
@@ -23,7 +23,15 @@ if("${CROSS}" STREQUAL "")
set(CROSS i686-w64-mingw32-)
endif()
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
-set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
diff --git a/build/cmake/toolchains/x86_64-macos.cmake b/build/cmake/toolchains/x86_64-macos.cmake
new file mode 100644
index 000000000..899df6f35
--- /dev/null
+++ b/build/cmake/toolchains/x86_64-macos.cmake
@@ -0,0 +1,16 @@
+#
+# Copyright (c) 2022, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_ARCHITECTURES "x86_64")
+set(CMAKE_C_FLAGS_INIT "-arch x86_64")
+set(CMAKE_CXX_FLAGS_INIT "-arch x86_64")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch x86_64")
diff --git a/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/build/cmake/toolchains/x86_64-mingw-gcc.cmake
index 4b2d28deb..56e9b6ecb 100644
--- a/build/cmake/toolchains/x86_64-mingw-gcc.cmake
+++ b/build/cmake/toolchains/x86_64-mingw-gcc.cmake
@@ -20,7 +20,15 @@ if("${CROSS}" STREQUAL "")
set(CROSS x86_64-w64-mingw32-)
endif()
-set(CMAKE_C_COMPILER ${CROSS}gcc)
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
-set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
-set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
diff --git a/common/ivfdec.c b/common/ivfdec.c
index 18f053e3a..6e714d1cf 100644
--- a/common/ivfdec.c
+++ b/common/ivfdec.c
@@ -17,6 +17,7 @@
#include "aom_ports/mem_ops.h"
#include "aom_ports/sanitizer.h"
+#include "tools_common.h"
static const char *IVF_SIGNATURE = "DKIF";
@@ -29,10 +30,10 @@ static void fix_framerate(int *num, int *den) {
}
int file_is_ivf(struct AvxInputContext *input_ctx) {
- char raw_hdr[32];
+ unsigned char raw_hdr[32];
int is_ivf = 0;
- if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) {
+ if (buffer_input(input_ctx, 32, raw_hdr, /*buffered=*/true) == 32) {
if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) {
is_ivf = 1;
@@ -53,21 +54,21 @@ int file_is_ivf(struct AvxInputContext *input_ctx) {
}
if (!is_ivf) {
- rewind(input_ctx->file);
- input_ctx->detect.buf_read = 0;
- } else {
- input_ctx->detect.position = 4;
+ rewind_detect(input_ctx);
}
return is_ivf;
}
-int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
- size_t *buffer_size, aom_codec_pts_t *pts) {
- char raw_header[IVF_FRAME_HDR_SZ] = { 0 };
+int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size,
+ aom_codec_pts_t *pts) {
+ unsigned char raw_header[IVF_FRAME_HDR_SZ] = { 0 };
size_t frame_size = 0;
- if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
- if (!feof(infile)) fprintf(stderr, "Warning: Failed to read frame size\n");
+ if (read_from_input(input_ctx, IVF_FRAME_HDR_SZ, raw_header) !=
+ IVF_FRAME_HDR_SZ) {
+ if (!input_eof(input_ctx))
+ fprintf(stderr, "Warning: Failed to read frame size\n");
} else {
frame_size = mem_get_le32(raw_header);
@@ -95,9 +96,9 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
}
}
- if (!feof(infile)) {
+ if (!input_eof(input_ctx)) {
ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
- if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+ if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) {
fprintf(stderr, "Warning: Failed to read full frame\n");
return 1;
}
diff --git a/common/ivfdec.h b/common/ivfdec.h
index dbc77331f..e8fe8d0c5 100644
--- a/common/ivfdec.h
+++ b/common/ivfdec.h
@@ -19,8 +19,9 @@ extern "C" {
#endif
int file_is_ivf(struct AvxInputContext *input);
-int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
- size_t *buffer_size, aom_codec_pts_t *pts);
+int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size,
+ aom_codec_pts_t *pts);
#ifdef __cplusplus
} /* extern "C" */
diff --git a/common/md5_utils.c b/common/md5_utils.c
index b69e1cc72..c69aa57a3 100644
--- a/common/md5_utils.c
+++ b/common/md5_utils.c
@@ -150,19 +150,26 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
#define AOM_NO_UNSIGNED_OVERFLOW_CHECK \
__attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
-#endif
+#if __clang_major__ >= 12
+#define AOM_NO_UNSIGNED_SHIFT_CHECK \
+ __attribute__((no_sanitize("unsigned-shift-base")))
+#endif // __clang__ >= 12
+#endif // __clang__
#ifndef AOM_NO_UNSIGNED_OVERFLOW_CHECK
#define AOM_NO_UNSIGNED_OVERFLOW_CHECK
#endif
+#ifndef AOM_NO_UNSIGNED_SHIFT_CHECK
+#define AOM_NO_UNSIGNED_SHIFT_CHECK
+#endif
/*
* The core of the MD5 algorithm, this alters an existing MD5 hash to
* reflect the addition of 16 longwords of new data. MD5Update blocks
* the data and converts bytes into longwords for this routine.
*/
-AOM_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
- UWORD32 const in[16]) {
+AOM_NO_UNSIGNED_OVERFLOW_CHECK AOM_NO_UNSIGNED_SHIFT_CHECK void MD5Transform(
+ UWORD32 buf[4], UWORD32 const in[16]) {
register UWORD32 a, b, c, d;
a = buf[0];
@@ -245,5 +252,6 @@ AOM_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
}
#undef AOM_NO_UNSIGNED_OVERFLOW_CHECK
+#undef AOM_NO_UNSIGNED_SHIFT_CHECK
#endif
diff --git a/common/obudec.c b/common/obudec.c
index 650f9973b..8b7bd39a6 100644
--- a/common/obudec.c
+++ b/common/obudec.c
@@ -20,6 +20,7 @@
#include "aom_ports/mem_ops.h"
#include "av1/common/common.h"
#include "av1/common/obu_util.h"
+#include "tools_common.h"
#define OBU_BUFFER_SIZE (500 * 1024)
@@ -35,15 +36,18 @@
// Reads unsigned LEB128 integer and returns 0 upon successful read and decode.
// Stores raw bytes in 'value_buffer', length of the number in 'value_length',
-// and decoded value in 'value'.
-static int obudec_read_leb128(FILE *f, uint8_t *value_buffer,
- size_t *value_length, uint64_t *value) {
- if (!f || !value_buffer || !value_length || !value) return -1;
+// and decoded value in 'value'. If 'buffered' is true, it is buffered in the
+// detect buffer first.
+static int obudec_read_leb128(struct AvxInputContext *input_ctx,
+ uint8_t *value_buffer, size_t *value_length,
+ uint64_t *value, bool buffered) {
+ if (!input_ctx || !value_buffer || !value_length || !value) return -1;
size_t len;
for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) {
- const size_t num_read = fread(&value_buffer[len], 1, 1, f);
+ const size_t num_read =
+ buffer_input(input_ctx, 1, &value_buffer[len], buffered);
if (num_read == 0) {
- if (len == 0 && feof(f)) {
+ if (len == 0 && input_eof(input_ctx)) {
*value_length = 0;
return 0;
}
@@ -60,22 +64,24 @@ static int obudec_read_leb128(FILE *f, uint8_t *value_buffer,
return aom_uleb_decode(value_buffer, len, value, NULL);
}
-// Reads OBU header from 'f'. The 'buffer_capacity' passed in must be large
-// enough to store an OBU header with extension (2 bytes). Raw OBU data is
+// Reads OBU header from 'input_ctx'. The 'buffer_capacity' passed in must be
+// large enough to store an OBU header with extension (2 bytes). Raw OBU data is
// written to 'obu_data', parsed OBU header values are written to 'obu_header',
// and total bytes read from file are written to 'bytes_read'. Returns 0 for
// success, and non-zero on failure. When end of file is reached, the return
-// value is 0 and the 'bytes_read' value is set to 0.
-static int obudec_read_obu_header(FILE *f, size_t buffer_capacity,
- int is_annexb, uint8_t *obu_data,
- ObuHeader *obu_header, size_t *bytes_read) {
- if (!f || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) ||
+// value is 0 and the 'bytes_read' value is set to 0. If 'buffered' is true, it
+// is buffered in the detect buffer first.
+static int obudec_read_obu_header(struct AvxInputContext *input_ctx,
+ size_t buffer_capacity, int is_annexb,
+ uint8_t *obu_data, ObuHeader *obu_header,
+ size_t *bytes_read, bool buffered) {
+ if (!input_ctx || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) ||
!obu_data || !obu_header || !bytes_read) {
return -1;
}
- *bytes_read = fread(obu_data, 1, 1, f);
+ *bytes_read = buffer_input(input_ctx, 1, obu_data, buffered);
- if (feof(f) && *bytes_read == 0) {
+ if (input_eof(input_ctx) && *bytes_read == 0) {
return 0;
} else if (*bytes_read != 1) {
fprintf(stderr, "obudec: Failure reading OBU header.\n");
@@ -84,7 +90,7 @@ static int obudec_read_obu_header(FILE *f, size_t buffer_capacity,
const int has_extension = (obu_data[0] >> 2) & 0x1;
if (has_extension) {
- if (fread(&obu_data[1], 1, 1, f) != 1) {
+ if (buffer_input(input_ctx, 1, &obu_data[1], buffered) != 1) {
fprintf(stderr, "obudec: Failure reading OBU extension.");
return -1;
}
@@ -102,14 +108,17 @@ static int obudec_read_obu_header(FILE *f, size_t buffer_capacity,
return 0;
}
-// Reads OBU payload from 'f' and returns 0 for success when all payload bytes
-// are read from the file. Payload data is written to 'obu_data', and actual
-// bytes read added to 'bytes_read'.
-static int obudec_read_obu_payload(FILE *f, size_t payload_length,
- uint8_t *obu_data, size_t *bytes_read) {
- if (!f || payload_length == 0 || !obu_data || !bytes_read) return -1;
-
- if (fread(obu_data, 1, payload_length, f) != payload_length) {
+// Reads OBU payload from 'input_ctx' and returns 0 for success when all payload
+// bytes are read from the file. Payload data is written to 'obu_data', and
+// actual bytes read added to 'bytes_read'. If 'buffered' is true, it is
+// buffered in the detect buffer first.
+static int obudec_read_obu_payload(struct AvxInputContext *input_ctx,
+ size_t payload_length, uint8_t *obu_data,
+ size_t *bytes_read, bool buffered) {
+ if (!input_ctx || payload_length == 0 || !obu_data || !bytes_read) return -1;
+
+ if (buffer_input(input_ctx, payload_length, obu_data, buffered) !=
+ payload_length) {
fprintf(stderr, "obudec: Failure reading OBU payload.\n");
return -1;
}
@@ -118,13 +127,12 @@ static int obudec_read_obu_payload(FILE *f, size_t payload_length,
return 0;
}
-static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity,
- int is_annexb, uint8_t *buffer,
- size_t *bytes_read,
- size_t *payload_length,
- ObuHeader *obu_header) {
+static int obudec_read_obu_header_and_size(
+ struct AvxInputContext *input_ctx, size_t buffer_capacity, int is_annexb,
+ uint8_t *buffer, size_t *bytes_read, size_t *payload_length,
+ ObuHeader *obu_header, bool buffered) {
const size_t kMinimumBufferSize = OBU_MAX_HEADER_SIZE;
- if (!f || !buffer || !bytes_read || !payload_length || !obu_header ||
+ if (!input_ctx || !buffer || !bytes_read || !payload_length || !obu_header ||
buffer_capacity < kMinimumBufferSize) {
return -1;
}
@@ -133,7 +141,8 @@ static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity,
size_t leb128_length_payload = 0;
uint64_t obu_size = 0;
if (is_annexb) {
- if (obudec_read_leb128(f, &buffer[0], &leb128_length_obu, &obu_size) != 0) {
+ if (obudec_read_leb128(input_ctx, &buffer[0], &leb128_length_obu, &obu_size,
+ buffered) != 0) {
fprintf(stderr, "obudec: Failure reading OBU size length.\n");
return -1;
} else if (leb128_length_obu == 0) {
@@ -147,9 +156,9 @@ static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity,
}
size_t header_size = 0;
- if (obudec_read_obu_header(f, buffer_capacity - leb128_length_obu, is_annexb,
- buffer + leb128_length_obu, obu_header,
- &header_size) != 0) {
+ if (obudec_read_obu_header(input_ctx, buffer_capacity - leb128_length_obu,
+ is_annexb, buffer + leb128_length_obu, obu_header,
+ &header_size, buffered) != 0) {
return -1;
} else if (header_size == 0) {
*payload_length = 0;
@@ -165,8 +174,9 @@ static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity,
*payload_length = (size_t)obu_size - header_size;
} else {
uint64_t u64_payload_length = 0;
- if (obudec_read_leb128(f, &buffer[leb128_length_obu + header_size],
- &leb128_length_payload, &u64_payload_length) != 0) {
+ if (obudec_read_leb128(input_ctx, &buffer[leb128_length_obu + header_size],
+ &leb128_length_payload, &u64_payload_length,
+ buffered) != 0) {
fprintf(stderr, "obudec: Failure reading OBU payload length.\n");
return -1;
}
@@ -214,11 +224,12 @@ static int obudec_grow_buffer(size_t growth_amount, uint8_t **obu_buffer,
return 0;
}
-static int obudec_read_one_obu(FILE *f, uint8_t **obu_buffer,
- size_t obu_bytes_buffered,
+static int obudec_read_one_obu(struct AvxInputContext *input_ctx,
+ uint8_t **obu_buffer, size_t obu_bytes_buffered,
size_t *obu_buffer_capacity, size_t *obu_length,
- ObuHeader *obu_header, int is_annexb) {
- if (!f || !(*obu_buffer) || !obu_buffer_capacity || !obu_length ||
+ ObuHeader *obu_header, int is_annexb,
+ bool buffered) {
+ if (!input_ctx || !(*obu_buffer) || !obu_buffer_capacity || !obu_length ||
!obu_header) {
return -1;
}
@@ -238,8 +249,9 @@ static int obudec_read_one_obu(FILE *f, uint8_t **obu_buffer,
}
const int status = obudec_read_obu_header_and_size(
- f, available_buffer_capacity, is_annexb, *obu_buffer + obu_bytes_buffered,
- &bytes_read, &obu_payload_length, obu_header);
+ input_ctx, available_buffer_capacity, is_annexb,
+ *obu_buffer + obu_bytes_buffered, &bytes_read, &obu_payload_length,
+ obu_header, buffered);
if (status < 0) return status;
if (obu_payload_length > SIZE_MAX - bytes_read) return -1;
@@ -259,9 +271,9 @@ static int obudec_read_one_obu(FILE *f, uint8_t **obu_buffer,
}
if (obu_payload_length > 0 &&
- obudec_read_obu_payload(f, obu_payload_length,
+ obudec_read_obu_payload(input_ctx, obu_payload_length,
*obu_buffer + obu_bytes_buffered + bytes_read,
- &bytes_read) != 0) {
+ &bytes_read, buffered) != 0) {
return -1;
}
@@ -275,7 +287,6 @@ int file_is_obu(struct ObuDecInputContext *obu_ctx) {
struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx;
uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 };
const int is_annexb = obu_ctx->is_annexb;
- FILE *f = avx_ctx->file;
size_t payload_length = 0;
ObuHeader obu_header;
memset(&obu_header, 0, sizeof(obu_header));
@@ -285,16 +296,19 @@ int file_is_obu(struct ObuDecInputContext *obu_ctx) {
if (is_annexb) {
// read the size of first temporal unit
- if (obudec_read_leb128(f, &detect_buf[0], &length_of_unit_size,
- &unit_size) != 0) {
+ if (obudec_read_leb128(avx_ctx, &detect_buf[0], &length_of_unit_size,
+ &unit_size, /*buffered=*/true) != 0) {
fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+ rewind_detect(avx_ctx);
return 0;
}
// read the size of first frame unit
- if (obudec_read_leb128(f, &detect_buf[length_of_unit_size],
- &annexb_header_length, &unit_size) != 0) {
+ if (obudec_read_leb128(avx_ctx, &detect_buf[length_of_unit_size],
+ &annexb_header_length, &unit_size,
+ /*buffered=*/true) != 0) {
fprintf(stderr, "obudec: Failure reading frame unit header\n");
+ rewind_detect(avx_ctx);
return 0;
}
annexb_header_length += length_of_unit_size;
@@ -302,11 +316,11 @@ int file_is_obu(struct ObuDecInputContext *obu_ctx) {
size_t bytes_read = 0;
if (obudec_read_obu_header_and_size(
- f, OBU_DETECTION_SIZE - annexb_header_length, is_annexb,
+ avx_ctx, OBU_DETECTION_SIZE - annexb_header_length, is_annexb,
&detect_buf[annexb_header_length], &bytes_read, &payload_length,
- &obu_header) != 0) {
+ &obu_header, /*buffered=*/true) != 0) {
fprintf(stderr, "obudec: Failure reading first OBU.\n");
- rewind(f);
+ rewind_detect(avx_ctx);
return 0;
}
@@ -316,6 +330,7 @@ int file_is_obu(struct ObuDecInputContext *obu_ctx) {
if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
obu_header.type != OBU_SEQUENCE_HEADER) {
+ rewind_detect(avx_ctx);
return 0;
}
@@ -324,12 +339,12 @@ int file_is_obu(struct ObuDecInputContext *obu_ctx) {
fprintf(
stderr,
"obudec: Invalid OBU_TEMPORAL_DELIMITER payload length (non-zero).");
- rewind(f);
+ rewind_detect(avx_ctx);
return 0;
}
} else if (!is_annexb) {
fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n");
- rewind(f);
+ rewind_detect(avx_ctx);
return 0;
}
@@ -337,7 +352,7 @@ int file_is_obu(struct ObuDecInputContext *obu_ctx) {
obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE);
if (!obu_ctx->buffer) {
fprintf(stderr, "Out of memory.\n");
- rewind(f);
+ rewind_detect(avx_ctx);
return 0;
}
obu_ctx->buffer_capacity = OBU_BUFFER_SIZE;
@@ -349,15 +364,18 @@ int file_is_obu(struct ObuDecInputContext *obu_ctx) {
if (payload_length > 0) {
if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) {
fprintf(stderr, "obudec: First OBU's payload is too large\n");
- rewind(f);
+ rewind_detect(avx_ctx);
+ obudec_free(obu_ctx);
return 0;
}
size_t payload_bytes = 0;
const int status = obudec_read_obu_payload(
- f, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes);
+ avx_ctx, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes,
+ /*buffered=*/false);
if (status < 0) {
- rewind(f);
+ rewind_detect(avx_ctx);
+ obudec_free(obu_ctx);
return 0;
}
obu_ctx->bytes_buffered += payload_bytes;
@@ -374,7 +392,7 @@ int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
*buffer_size = 0;
*bytes_read = 0;
- if (feof(f)) {
+ if (input_eof(obu_ctx->avx_ctx)) {
return 1;
}
@@ -387,12 +405,13 @@ int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
uint64_t size = 0;
if (obu_ctx->bytes_buffered == 0) {
- if (obudec_read_leb128(f, &tuheader[0], &length_of_temporal_unit_size,
- &size) != 0) {
+ if (obudec_read_leb128(obu_ctx->avx_ctx, &tuheader[0],
+ &length_of_temporal_unit_size, &size,
+ /*buffered=*/false) != 0) {
fprintf(stderr, "obudec: Failure reading temporal unit header\n");
return -1;
}
- if (size == 0 && feof(f)) {
+ if (size == 0 && input_eof(obu_ctx->avx_ctx)) {
return 1;
}
} else {
@@ -416,9 +435,10 @@ int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
ObuHeader obu_header;
memset(&obu_header, 0, sizeof(obu_header));
- if (obudec_read_one_obu(f, &obu_ctx->buffer, obu_ctx->bytes_buffered,
+ if (obudec_read_one_obu(obu_ctx->avx_ctx, &obu_ctx->buffer,
+ obu_ctx->bytes_buffered,
&obu_ctx->buffer_capacity, &obu_size, &obu_header,
- 0) != 0) {
+ 0, /*buffered=*/false) != 0) {
fprintf(stderr, "obudec: read_one_obu failed in TU loop\n");
return -1;
}
@@ -459,7 +479,7 @@ int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
obu_size);
obu_ctx->bytes_buffered = obu_size;
} else {
- if (!feof(f)) {
+ if (!input_eof(obu_ctx->avx_ctx)) {
size_t data_size;
size_t offset;
if (!obu_ctx->bytes_buffered) {
@@ -474,7 +494,8 @@ int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
obu_ctx->bytes_buffered -= copy_size;
}
- if (fread(*buffer + offset, 1, data_size, f) != data_size) {
+ if (read_from_input(obu_ctx->avx_ctx, data_size, *buffer + offset) !=
+ data_size) {
fprintf(stderr, "obudec: Failed to read full temporal unit\n");
return -1;
}
@@ -483,4 +504,9 @@ int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
return 0;
}
-void obudec_free(struct ObuDecInputContext *obu_ctx) { free(obu_ctx->buffer); }
+void obudec_free(struct ObuDecInputContext *obu_ctx) {
+ free(obu_ctx->buffer);
+ obu_ctx->buffer = NULL;
+ obu_ctx->buffer_capacity = 0;
+ obu_ctx->bytes_buffered = 0;
+}
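file_is_obu() can now fail after the OBU buffer has been allocated, and it releases that buffer itself on those paths; resetting the fields in obudec_free() keeps a later cleanup call from double-freeing or reusing stale sizes. A minimal sketch of the calling pattern this enables, assuming input_ctx is an already-opened AvxInputContext:

ObuDecInputContext obu_ctx = {};
obu_ctx.avx_ctx = &input_ctx;
obu_ctx.is_annexb = 0;
if (!file_is_obu(&obu_ctx)) {
  // Not an OBU stream; any buffer allocated during probing was already freed.
}
obudec_free(&obu_ctx);  // Safe even if probing failed or never allocated.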
diff --git a/common/tools_common.c b/common/tools_common.c
index 86f05b118..6b579e0c5 100644
--- a/common/tools_common.c
+++ b/common/tools_common.c
@@ -556,3 +556,72 @@ void aom_img_write_nv12(const aom_image_t *img, FILE *file) {
vbuf += (stride - w * size);
}
}
+
+size_t read_from_input(struct AvxInputContext *input_ctx, size_t n,
+ unsigned char *buf) {
+ const size_t buffered_bytes =
+ input_ctx->detect.buf_read - input_ctx->detect.position;
+ size_t read_n;
+ if (buffered_bytes == 0) {
+ read_n = fread(buf, 1, n, input_ctx->file);
+ } else if (n <= buffered_bytes) {
+ memcpy(buf, input_ctx->detect.buf + input_ctx->detect.position, n);
+ input_ctx->detect.position += n;
+ read_n = n;
+ } else {
+ memcpy(buf, input_ctx->detect.buf + input_ctx->detect.position,
+ buffered_bytes);
+ input_ctx->detect.position += buffered_bytes;
+ read_n = buffered_bytes;
+ read_n +=
+ fread(buf + buffered_bytes, 1, n - buffered_bytes, input_ctx->file);
+ }
+ return read_n;
+}
+
+size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n) {
+ if (n + input_ctx->detect.position > DETECT_BUF_SZ) {
+ die("Failed to store in the detect buffer, maximum size exceeded.");
+ }
+ const size_t buffered_bytes =
+ input_ctx->detect.buf_read - input_ctx->detect.position;
+ size_t read_n;
+ if (buffered_bytes == 0) {
+ read_n = fread(input_ctx->detect.buf + input_ctx->detect.buf_read, 1, n,
+ input_ctx->file);
+ input_ctx->detect.buf_read += read_n;
+ } else if (n <= buffered_bytes) {
+ // In this case, we don't need to do anything: the data is already in
+ // the detect buffer.
+ read_n = n;
+ } else {
+ read_n = fread(input_ctx->detect.buf + input_ctx->detect.buf_read, 1,
+ n - buffered_bytes, input_ctx->file);
+ input_ctx->detect.buf_read += read_n;
+ read_n += buffered_bytes;
+ }
+ return read_n;
+}
+
+// Read n bytes into buf. When buffered is true, the bytes are staged in the
+// detect buffer first (reading from the input as needed) so that a later
+// rewind_detect() can replay them; otherwise this is a plain read_from_input().
+size_t buffer_input(struct AvxInputContext *input_ctx, size_t n,
+ unsigned char *buf, bool buffered) {
+ if (!buffered) {
+ return read_from_input(input_ctx, n, buf);
+ }
+ const size_t buf_n = input_to_detect_buf(input_ctx, n);
+ if (buf_n < n) {
+ return buf_n;
+ }
+ return read_from_input(input_ctx, n, buf);
+}
+
+void rewind_detect(struct AvxInputContext *input_ctx) {
+ input_ctx->detect.position = 0;
+}
+
+bool input_eof(struct AvxInputContext *input_ctx) {
+ return feof(input_ctx->file) &&
+ input_ctx->detect.position == input_ctx->detect.buf_read;
+}
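
The helpers added above form a small replay cache in front of the input FILE*:
buffer_input() stages bytes in FileTypeDetectionBuffer when its buffered flag
is set, rewind_detect() resets the read cursor so a failed probe can be retried
by the next detector, and read_from_input() drains any staged bytes before
falling back to fread(). The snippet below is only a minimal, self-contained
sketch of that pattern; DetectBuf, stage_read(), rewind_probe() and the 8-byte
capacity are hypothetical stand-ins, not libaom code.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for FileTypeDetectionBuffer: bytes read while probing
 * are kept so a later reader can see them again. */
#define CAP 8
struct DetectBuf {
  unsigned char buf[CAP];
  size_t buf_read;  /* bytes staged from the stream so far */
  size_t position;  /* bytes already handed back to the caller */
};

/* Probe read: hand back n bytes, staging them first so they stay replayable. */
static size_t stage_read(struct DetectBuf *d, FILE *f, size_t n,
                         unsigned char *out) {
  size_t want = d->position + n;
  if (want > CAP) want = CAP;  /* never overflow the staging buffer */
  while (d->buf_read < want) {
    size_t got = fread(d->buf + d->buf_read, 1, want - d->buf_read, f);
    if (got == 0) break;  /* EOF or read error */
    d->buf_read += got;
  }
  size_t avail = d->buf_read - d->position;
  size_t take = n < avail ? n : avail;
  memcpy(out, d->buf + d->position, take);
  d->position += take;
  return take;
}

/* Failed probe: rewind only the staging cursor, like rewind_detect(). */
static void rewind_probe(struct DetectBuf *d) { d->position = 0; }

int main(void) {
  /* Use a temp file as the "stream"; a non-seekable pipe would behave the same. */
  FILE *f = tmpfile();
  if (!f) return 1;
  fputs("DKIFdata", f);  /* "DKIF" is the IVF signature */
  rewind(f);

  struct DetectBuf d = { { 0 }, 0, 0 };
  unsigned char hdr[4];

  /* First probe (wrong format): bytes stay staged, cursor rewinds. */
  stage_read(&d, f, 4, hdr);
  if (memcmp(hdr, "OBU ", 4) != 0) rewind_probe(&d);

  /* Second probe sees the same four bytes again. */
  stage_read(&d, f, 4, hdr);
  printf("second probe saw: %.4s\n", (const char *)hdr);

  fclose(f);
  return 0;
}

Running it prints "second probe saw: DKIF", i.e. the second detector is handed
the same bytes the first one already consumed.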
diff --git a/common/tools_common.h b/common/tools_common.h
index 77494dea3..eeccbe466 100644
--- a/common/tools_common.h
+++ b/common/tools_common.h
@@ -68,6 +68,9 @@ typedef long FileOffset; /* NOLINT */
#define IVF_FILE_HDR_SZ 32
#define RAW_FRAME_HDR_SZ sizeof(uint32_t)
+#define OBU_DETECTION_SZ 34 // See common/obudec.c
+
+#define DETECT_BUF_SZ 34 // Max of the above header sizes
#define AV1_FOURCC 0x31305641
@@ -83,7 +86,7 @@ enum VideoFileType {
#define LST_FOURCC 0x4354534c
struct FileTypeDetectionBuffer {
- char buf[4];
+ char buf[DETECT_BUF_SZ];
size_t buf_read;
size_t position;
};
@@ -188,6 +191,14 @@ void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src);
// Output in NV12 format.
void aom_img_write_nv12(const aom_image_t *img, FILE *file);
+size_t read_from_input(struct AvxInputContext *input_ctx, size_t n,
+ unsigned char *buf);
+size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n);
+size_t buffer_input(struct AvxInputContext *input_ctx, size_t n,
+ unsigned char *buf, bool buffered);
+void rewind_detect(struct AvxInputContext *input_ctx);
+bool input_eof(struct AvxInputContext *input_ctx);
+
#ifdef __cplusplus
} /* extern "C" */
#endif
diff --git a/common/video_reader.c b/common/video_reader.c
index 7b021bc40..27f69a967 100644
--- a/common/video_reader.c
+++ b/common/video_reader.c
@@ -32,7 +32,9 @@ struct AvxVideoReaderStruct {
AvxVideoReader *aom_video_reader_open(const char *filename) {
AvxVideoReader *reader = NULL;
- FILE *const file = fopen(filename, "rb");
+ const bool using_file = strcmp(filename, "-") != 0;
+ FILE *const file =
+ using_file ? fopen(filename, "rb") : set_binary_mode(stdin);
if (!file) return NULL; // Can't open file
reader = (AvxVideoReader *)calloc(1, sizeof(*reader));
@@ -46,18 +48,24 @@ AvxVideoReader *aom_video_reader_open(const char *filename) {
reader->obu_ctx.avx_ctx = &reader->input_ctx;
reader->obu_ctx.is_annexb = 1;
- if (file_is_ivf(&reader->input_ctx)) {
- reader->input_ctx.file_type = FILE_TYPE_IVF;
- reader->info.codec_fourcc = reader->input_ctx.fourcc;
- reader->info.frame_width = reader->input_ctx.width;
- reader->info.frame_height = reader->input_ctx.height;
+ // TODO(https://crbug.com/aomedia/1706): webm type does not support reading
+ // from stdin yet, and file_is_webm is not using the detect buffer when
+ // determining the type. Therefore it should only be checked when using a file
+ // and needs to be checked prior to other types.
+ if (false) {
#if CONFIG_WEBM_IO
- } else if (file_is_webm(&reader->webm_ctx, &reader->input_ctx)) {
+ } else if (using_file &&
+ file_is_webm(&reader->webm_ctx, &reader->input_ctx)) {
reader->input_ctx.file_type = FILE_TYPE_WEBM;
reader->info.codec_fourcc = reader->input_ctx.fourcc;
reader->info.frame_width = reader->input_ctx.width;
reader->info.frame_height = reader->input_ctx.height;
#endif
+ } else if (file_is_ivf(&reader->input_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_IVF;
+ reader->info.codec_fourcc = reader->input_ctx.fourcc;
+ reader->info.frame_width = reader->input_ctx.width;
+ reader->info.frame_height = reader->input_ctx.height;
} else if (file_is_obu(&reader->obu_ctx)) {
reader->input_ctx.file_type = FILE_TYPE_OBU;
// assume AV1
@@ -85,7 +93,7 @@ void aom_video_reader_close(AvxVideoReader *reader) {
int aom_video_reader_read_frame(AvxVideoReader *reader) {
if (reader->input_ctx.file_type == FILE_TYPE_IVF) {
- return !ivf_read_frame(reader->input_ctx.file, &reader->buffer,
+ return !ivf_read_frame(&reader->input_ctx, &reader->buffer,
&reader->frame_size, &reader->buffer_size,
&reader->pts);
} else if (reader->input_ctx.file_type == FILE_TYPE_OBU) {
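
With this change a filename of "-" makes the reader pull from stdin (via
set_binary_mode), and, per the TODO above, WebM probing is only attempted for
real files because file_is_webm() bypasses the detect buffer; the IVF and OBU
probes go through the detect buffer, so they can be retried on a non-seekable
pipe. A hedged usage sketch, assuming the usual video_reader.h helpers from
libaom's example support code (aom_video_reader_read_frame() returns nonzero
while frames remain):

#include <stdio.h>

#include "common/video_reader.h"

/* Count frames from an IVF or raw-OBU stream, e.g.
 *   cat input.ivf | ./count_frames -
 * Error handling is kept minimal; this is illustrative only. */
int main(int argc, char **argv) {
  const char *path = argc > 1 ? argv[1] : "-";  /* "-" now selects stdin */
  AvxVideoReader *reader = aom_video_reader_open(path);
  if (!reader) {
    fprintf(stderr, "Failed to open %s\n", path);
    return 1;
  }
  int frames = 0;
  while (aom_video_reader_read_frame(reader)) ++frames;
  printf("%d frames\n", frames);
  aom_video_reader_close(reader);
  return 0;
}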
diff --git a/common/webmdec.h b/common/webmdec.h
index 5ac75cb30..fcbdeffe4 100644
--- a/common/webmdec.h
+++ b/common/webmdec.h
@@ -28,7 +28,7 @@ struct WebmInputContext {
const void *block;
int block_frame_index;
int video_track_index;
- uint64_t timestamp_ns;
+ int64_t timestamp_ns;
int is_key_frame;
int reached_eos;
};
diff --git a/common/y4menc.c b/common/y4menc.c
index eaeedba57..7d3246546 100644
--- a/common/y4menc.c
+++ b/common/y4menc.c
@@ -52,30 +52,25 @@ static const char *colorspace(unsigned int bit_depth,
switch (bit_depth) {
case 8: return colorspace8(csp, fmt);
case 9:
- return fmt == AOM_IMG_FMT_I44416
- ? "C444p9 XYSCSS=444P9"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9"
- : "C420p9 XYSCSS=420P9";
+ return fmt == AOM_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9"
+ : "C420p9 XYSCSS=420P9";
case 10:
- return fmt == AOM_IMG_FMT_I44416
- ? "C444p10 XYSCSS=444P10"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10"
- : "C420p10 XYSCSS=420P10";
+ return fmt == AOM_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10"
+ : "C420p10 XYSCSS=420P10";
case 12:
- return fmt == AOM_IMG_FMT_I44416
- ? "C444p12 XYSCSS=444P12"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12"
- : "C420p12 XYSCSS=420P12";
+ return fmt == AOM_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12"
+ : "C420p12 XYSCSS=420P12";
case 14:
- return fmt == AOM_IMG_FMT_I44416
- ? "C444p14 XYSCSS=444P14"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14"
- : "C420p14 XYSCSS=420P14";
+ return fmt == AOM_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14"
+ : "C420p14 XYSCSS=420P14";
case 16:
- return fmt == AOM_IMG_FMT_I44416
- ? "C444p16 XYSCSS=444P16"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16"
- : "C420p16 XYSCSS=420P16";
+ return fmt == AOM_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16"
+ : "C420p16 XYSCSS=420P16";
default: assert(0); return NULL;
}
}
diff --git a/config/arm/config/aom_config.asm b/config/arm/config/aom_config.asm
index 5a3bb6ad2..0ea9a460c 100644
--- a/config/arm/config/aom_config.asm
+++ b/config/arm/config/aom_config.asm
@@ -1,5 +1,5 @@
;
-; Copyright (c) 2022, Alliance for Open Media. All rights reserved
+; Copyright (c) 2023, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,7 +9,6 @@
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
ARCH_ARM equ 1
-ARCH_MIPS equ 0
ARCH_PPC equ 0
ARCH_X86 equ 0
ARCH_X86_64 equ 0
@@ -67,14 +66,11 @@ CONFIG_WEBM_IO equ 1
DECODE_HEIGHT_LIMIT equ 16384
DECODE_WIDTH_LIMIT equ 16384
FORCE_HIGHBITDEPTH_DECODING equ 0
+HAVE_ARM_CRC32 equ 0
HAVE_AVX equ 0
HAVE_AVX2 equ 0
-HAVE_DSPR2 equ 0
HAVE_FEXCEPT equ 1
-HAVE_MIPS32 equ 0
-HAVE_MIPS64 equ 0
HAVE_MMX equ 0
-HAVE_MSA equ 0
HAVE_NEON equ 1
HAVE_PTHREAD_H equ 1
HAVE_SSE equ 0
diff --git a/config/arm/config/aom_config.h b/config/arm/config/aom_config.h
index f59508241..616a56fb6 100644
--- a/config/arm/config/aom_config.h
+++ b/config/arm/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,7 +11,6 @@
#ifndef AOM_CONFIG_H_
#define AOM_CONFIG_H_
#define ARCH_ARM 1
-#define ARCH_MIPS 0
#define ARCH_PPC 0
#define ARCH_X86 0
#define ARCH_X86_64 0
@@ -69,14 +68,11 @@
#define DECODE_HEIGHT_LIMIT 16384
#define DECODE_WIDTH_LIMIT 16384
#define FORCE_HIGHBITDEPTH_DECODING 0
+#define HAVE_ARM_CRC32 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
-#define HAVE_DSPR2 0
#define HAVE_FEXCEPT 1
-#define HAVE_MIPS32 0
-#define HAVE_MIPS64 0
#define HAVE_MMX 0
-#define HAVE_MSA 0
#define HAVE_NEON 1
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 0
diff --git a/config/arm/config/aom_dsp_rtcd.h b/config/arm/config/aom_dsp_rtcd.h
index 9348bfe4b..7ae663693 100644
--- a/config/arm/config/aom_dsp_rtcd.h
+++ b/config/arm/config/aom_dsp_rtcd.h
@@ -488,9 +488,12 @@ void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, int
unsigned int aom_get_mb_ss_c(const int16_t *);
#define aom_get_mb_ss aom_get_mb_ss_c
-void aom_get_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void aom_get_sse_sum_8x8_quad_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define aom_get_sse_sum_8x8_quad aom_get_sse_sum_8x8_quad_neon
+void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
+#define aom_get_var_sse_sum_16x16_dual aom_get_var_sse_sum_16x16_dual_c
+
+void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+#define aom_get_var_sse_sum_8x8_quad aom_get_var_sse_sum_8x8_quad_neon
void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -2778,6 +2781,9 @@ unsigned int aom_highbd_sad128x128_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
+
void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_c
@@ -2787,6 +2793,9 @@ unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
+
void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_c
@@ -2796,6 +2805,9 @@ unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_c
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
+
void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_c
@@ -2805,6 +2817,9 @@ unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_c
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
+
void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_c
@@ -2814,6 +2829,9 @@ unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_c
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
+
void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_c
@@ -2823,6 +2841,9 @@ unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_c
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
+
void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_c
@@ -2832,6 +2853,9 @@ unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_c
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
+
void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_c
@@ -2841,6 +2865,9 @@ unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_c
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
+
void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_c
@@ -2850,6 +2877,9 @@ unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_c
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
+
void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_c
@@ -2859,6 +2889,9 @@ unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_c
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
+
void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_c
@@ -2868,6 +2901,9 @@ unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_c
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
+
void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_c
@@ -2877,6 +2913,9 @@ unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_c
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
+
void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_c
@@ -2886,6 +2925,9 @@ unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_c
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
+
void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_c
@@ -2895,6 +2937,9 @@ unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_c
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
+
void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_c
@@ -2904,6 +2949,9 @@ unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
+
void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_c
@@ -2913,6 +2961,9 @@ unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_c
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
+
void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_c
@@ -2922,6 +2973,9 @@ unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_c
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
+
void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_c
@@ -2931,6 +2985,9 @@ unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_c
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
+
void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_c
@@ -2940,6 +2997,9 @@ unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_c
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
+
void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_c
@@ -2949,6 +3009,9 @@ unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_c
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
+
void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_c
@@ -2958,6 +3021,9 @@ unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_c
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
+
void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_c
@@ -2967,6 +3033,9 @@ unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_c
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
+
void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_c
@@ -3432,12 +3501,12 @@ void aom_ifft4x4_float_c(const float *input, float *temp, float *output);
void aom_ifft8x8_float_c(const float *input, float *temp, float *output);
#define aom_ifft8x8_float aom_ifft8x8_float_c
-int16_t aom_int_pro_col_c(const uint8_t *ref, const int width);
-int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width);
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_col aom_int_pro_col_neon
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
-void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_row aom_int_pro_row_neon
void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
@@ -3746,16 +3815,23 @@ unsigned int aom_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const
#define aom_mse16x16 aom_mse16x16_neon
unsigned int aom_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_mse16x8 aom_mse16x8_c
+unsigned int aom_mse16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define aom_mse16x8 aom_mse16x8_neon
unsigned int aom_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_mse8x16 aom_mse8x16_c
+unsigned int aom_mse8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define aom_mse8x16 aom_mse8x16_neon
unsigned int aom_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_mse8x8 aom_mse8x8_c
+unsigned int aom_mse8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define aom_mse8x8 aom_mse8x8_neon
+
+uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
+#define aom_mse_16xh_16bit aom_mse_16xh_16bit_c
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
-#define aom_mse_wxh_16bit aom_mse_wxh_16bit_c
+uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
+#define aom_mse_wxh_16bit aom_mse_wxh_16bit_neon
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
#define aom_mse_wxh_16bit_highbd aom_mse_wxh_16bit_highbd_c
@@ -4063,22 +4139,33 @@ unsigned int aom_sad128x128_neon(const uint8_t *src_ptr, int src_stride, const u
#define aom_sad128x128 aom_sad128x128_neon
unsigned int aom_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad128x128_avg aom_sad128x128_avg_c
+unsigned int aom_sad128x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad128x128_avg aom_sad128x128_avg_neon
+
+void aom_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x128x3d aom_sad128x128x3d_c
void aom_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad128x128x4d aom_sad128x128x4d_c
+void aom_sad128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x128x4d aom_sad128x128x4d_neon
void aom_sad128x128x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad128x128x4d_avg aom_sad128x128x4d_avg_c
unsigned int aom_sad128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad128x64 aom_sad128x64_c
+unsigned int aom_sad128x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad128x64 aom_sad128x64_neon
unsigned int aom_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad128x64_avg aom_sad128x64_avg_c
+unsigned int aom_sad128x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad128x64_avg aom_sad128x64_avg_neon
+
+void aom_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x64x3d aom_sad128x64x3d_c
void aom_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad128x64x4d aom_sad128x64x4d_c
+void aom_sad128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x64x4d aom_sad128x64x4d_neon
void aom_sad128x64x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad128x64x4d_avg aom_sad128x64x4d_avg_c
@@ -4091,7 +4178,11 @@ unsigned int aom_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uin
#define aom_sad16x16 aom_sad16x16_neon
unsigned int aom_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x16_avg aom_sad16x16_avg_c
+unsigned int aom_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x16_avg aom_sad16x16_avg_neon
+
+void aom_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x16x3d aom_sad16x16x3d_c
void aom_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -4101,37 +4192,55 @@ void aom_sad16x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t
#define aom_sad16x16x4d_avg aom_sad16x16x4d_avg_c
unsigned int aom_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad16x32 aom_sad16x32_c
+unsigned int aom_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad16x32 aom_sad16x32_neon
unsigned int aom_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x32_avg aom_sad16x32_avg_c
+unsigned int aom_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x32_avg aom_sad16x32_avg_neon
+
+void aom_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x32x3d aom_sad16x32x3d_c
void aom_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x32x4d aom_sad16x32x4d_c
+void aom_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x32x4d aom_sad16x32x4d_neon
void aom_sad16x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x32x4d_avg aom_sad16x32x4d_avg_c
unsigned int aom_sad16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad16x4 aom_sad16x4_c
+unsigned int aom_sad16x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad16x4 aom_sad16x4_neon
unsigned int aom_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x4_avg aom_sad16x4_avg_c
+unsigned int aom_sad16x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x4_avg aom_sad16x4_avg_neon
+
+void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x4x3d aom_sad16x4x3d_c
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x4d aom_sad16x4x4d_c
+void aom_sad16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x4x4d aom_sad16x4x4d_neon
void aom_sad16x4x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x4x4d_avg aom_sad16x4x4d_avg_c
unsigned int aom_sad16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad16x64 aom_sad16x64_c
+unsigned int aom_sad16x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad16x64 aom_sad16x64_neon
unsigned int aom_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x64_avg aom_sad16x64_avg_c
+unsigned int aom_sad16x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x64_avg aom_sad16x64_avg_neon
+
+void aom_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x64x3d aom_sad16x64x3d_c
void aom_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x64x4d aom_sad16x64x4d_c
+void aom_sad16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x64x4d aom_sad16x64x4d_neon
void aom_sad16x64x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x64x4d_avg aom_sad16x64x4d_avg_c
@@ -4141,10 +4250,15 @@ unsigned int aom_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint
#define aom_sad16x8 aom_sad16x8_neon
unsigned int aom_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x8_avg aom_sad16x8_avg_c
+unsigned int aom_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x8_avg aom_sad16x8_avg_neon
+
+void aom_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x8x3d aom_sad16x8x3d_c
void aom_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x8x4d aom_sad16x8x4d_c
+void aom_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x8x4d aom_sad16x8x4d_neon
void aom_sad16x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x8x4d_avg aom_sad16x8x4d_avg_c
@@ -4153,13 +4267,19 @@ unsigned int aom_sad16xh_c(const uint8_t *a, int a_stride, const uint8_t *b, int
#define aom_sad16xh aom_sad16xh_c
unsigned int aom_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad32x16 aom_sad32x16_c
+unsigned int aom_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad32x16 aom_sad32x16_neon
unsigned int aom_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x16_avg aom_sad32x16_avg_c
+unsigned int aom_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x16_avg aom_sad32x16_avg_neon
+
+void aom_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x16x3d aom_sad32x16x3d_c
void aom_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad32x16x4d aom_sad32x16x4d_c
+void aom_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x16x4d aom_sad32x16x4d_neon
void aom_sad32x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad32x16x4d_avg aom_sad32x16x4d_avg_c
@@ -4169,7 +4289,11 @@ unsigned int aom_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uin
#define aom_sad32x32 aom_sad32x32_neon
unsigned int aom_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x32_avg aom_sad32x32_avg_c
+unsigned int aom_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x32_avg aom_sad32x32_avg_neon
+
+void aom_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x32x3d aom_sad32x32x3d_c
void aom_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -4179,25 +4303,37 @@ void aom_sad32x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t
#define aom_sad32x32x4d_avg aom_sad32x32x4d_avg_c
unsigned int aom_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad32x64 aom_sad32x64_c
+unsigned int aom_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad32x64 aom_sad32x64_neon
unsigned int aom_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x64_avg aom_sad32x64_avg_c
+unsigned int aom_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x64_avg aom_sad32x64_avg_neon
+
+void aom_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x64x3d aom_sad32x64x3d_c
void aom_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad32x64x4d aom_sad32x64x4d_c
+void aom_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x64x4d aom_sad32x64x4d_neon
void aom_sad32x64x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad32x64x4d_avg aom_sad32x64x4d_avg_c
unsigned int aom_sad32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad32x8 aom_sad32x8_c
+unsigned int aom_sad32x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad32x8 aom_sad32x8_neon
unsigned int aom_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x8_avg aom_sad32x8_avg_c
+unsigned int aom_sad32x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x8_avg aom_sad32x8_avg_neon
+
+void aom_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x8x3d aom_sad32x8x3d_c
void aom_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad32x8x4d aom_sad32x8x4d_c
+void aom_sad32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x8x4d aom_sad32x8x4d_neon
void aom_sad32x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad32x8x4d_avg aom_sad32x8x4d_avg_c
@@ -4206,13 +4342,19 @@ unsigned int aom_sad32xh_c(const uint8_t *a, int a_stride, const uint8_t *b, int
#define aom_sad32xh aom_sad32xh_c
unsigned int aom_sad4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad4x16 aom_sad4x16_c
+unsigned int aom_sad4x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad4x16 aom_sad4x16_neon
unsigned int aom_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad4x16_avg aom_sad4x16_avg_c
+unsigned int aom_sad4x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad4x16_avg aom_sad4x16_avg_neon
+
+void aom_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x16x3d aom_sad4x16x3d_c
void aom_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad4x16x4d aom_sad4x16x4d_c
+void aom_sad4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x16x4d aom_sad4x16x4d_neon
void aom_sad4x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad4x16x4d_avg aom_sad4x16x4d_avg_c
@@ -4222,22 +4364,33 @@ unsigned int aom_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8
#define aom_sad4x4 aom_sad4x4_neon
unsigned int aom_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad4x4_avg aom_sad4x4_avg_c
+unsigned int aom_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad4x4_avg aom_sad4x4_avg_neon
+
+void aom_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x4x3d aom_sad4x4x3d_c
void aom_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad4x4x4d aom_sad4x4x4d_c
+void aom_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x4x4d aom_sad4x4x4d_neon
void aom_sad4x4x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad4x4x4d_avg aom_sad4x4x4d_avg_c
unsigned int aom_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad4x8 aom_sad4x8_c
+unsigned int aom_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad4x8 aom_sad4x8_neon
unsigned int aom_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad4x8_avg aom_sad4x8_avg_c
+unsigned int aom_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad4x8_avg aom_sad4x8_avg_neon
+
+void aom_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x8x3d aom_sad4x8x3d_c
void aom_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad4x8x4d aom_sad4x8x4d_c
+void aom_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x8x4d aom_sad4x8x4d_neon
void aom_sad4x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad4x8x4d_avg aom_sad4x8x4d_avg_c
@@ -4246,37 +4399,55 @@ unsigned int aom_sad4xh_c(const uint8_t *a, int a_stride, const uint8_t *b, int
#define aom_sad4xh aom_sad4xh_c
unsigned int aom_sad64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad64x128 aom_sad64x128_c
+unsigned int aom_sad64x128_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad64x128 aom_sad64x128_neon
unsigned int aom_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x128_avg aom_sad64x128_avg_c
+unsigned int aom_sad64x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x128_avg aom_sad64x128_avg_neon
+
+void aom_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x128x3d aom_sad64x128x3d_c
void aom_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad64x128x4d aom_sad64x128x4d_c
+void aom_sad64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x128x4d aom_sad64x128x4d_neon
void aom_sad64x128x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad64x128x4d_avg aom_sad64x128x4d_avg_c
unsigned int aom_sad64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad64x16 aom_sad64x16_c
+unsigned int aom_sad64x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad64x16 aom_sad64x16_neon
unsigned int aom_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x16_avg aom_sad64x16_avg_c
+unsigned int aom_sad64x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x16_avg aom_sad64x16_avg_neon
+
+void aom_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x16x3d aom_sad64x16x3d_c
void aom_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad64x16x4d aom_sad64x16x4d_c
+void aom_sad64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x16x4d aom_sad64x16x4d_neon
void aom_sad64x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad64x16x4d_avg aom_sad64x16x4d_avg_c
unsigned int aom_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad64x32 aom_sad64x32_c
+unsigned int aom_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad64x32 aom_sad64x32_neon
unsigned int aom_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x32_avg aom_sad64x32_avg_c
+unsigned int aom_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x32_avg aom_sad64x32_avg_neon
+
+void aom_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x32x3d aom_sad64x32x3d_c
void aom_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad64x32x4d aom_sad64x32x4d_c
+void aom_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x32x4d aom_sad64x32x4d_neon
void aom_sad64x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad64x32x4d_avg aom_sad64x32x4d_avg_c
@@ -4286,7 +4457,11 @@ unsigned int aom_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uin
#define aom_sad64x64 aom_sad64x64_neon
unsigned int aom_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x64_avg aom_sad64x64_avg_c
+unsigned int aom_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x64_avg aom_sad64x64_avg_neon
+
+void aom_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x64x3d aom_sad64x64x3d_c
void aom_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -4303,34 +4478,51 @@ unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint
#define aom_sad8x16 aom_sad8x16_neon
unsigned int aom_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x16_avg aom_sad8x16_avg_c
+unsigned int aom_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x16_avg aom_sad8x16_avg_neon
+
+void aom_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x16x3d aom_sad8x16x3d_c
void aom_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x16x4d aom_sad8x16x4d_c
+void aom_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x16x4d aom_sad8x16x4d_neon
void aom_sad8x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x16x4d_avg aom_sad8x16x4d_avg_c
unsigned int aom_sad8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad8x32 aom_sad8x32_c
+unsigned int aom_sad8x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad8x32 aom_sad8x32_neon
unsigned int aom_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x32_avg aom_sad8x32_avg_c
+unsigned int aom_sad8x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x32_avg aom_sad8x32_avg_neon
+
+void aom_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x32x3d aom_sad8x32x3d_c
void aom_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x32x4d aom_sad8x32x4d_c
+void aom_sad8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x32x4d aom_sad8x32x4d_neon
void aom_sad8x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x32x4d_avg aom_sad8x32x4d_avg_c
unsigned int aom_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad8x4 aom_sad8x4_c
+unsigned int aom_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad8x4 aom_sad8x4_neon
unsigned int aom_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x4_avg aom_sad8x4_avg_c
+unsigned int aom_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x4_avg aom_sad8x4_avg_neon
+
+void aom_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x4x3d aom_sad8x4x3d_c
void aom_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x4x4d aom_sad8x4x4d_c
+void aom_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x4x4d aom_sad8x4x4d_neon
void aom_sad8x4x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x4x4d_avg aom_sad8x4x4d_avg_c
@@ -4340,10 +4532,15 @@ unsigned int aom_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8
#define aom_sad8x8 aom_sad8x8_neon
unsigned int aom_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x8_avg aom_sad8x8_avg_c
+unsigned int aom_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x8_avg aom_sad8x8_avg_neon
+
+void aom_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x8x3d aom_sad8x8x3d_c
void aom_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x8x4d aom_sad8x8x4d_c
+void aom_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x8x4d aom_sad8x8x4d_neon
void aom_sad8x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x8x4d_avg aom_sad8x8x4d_avg_c
@@ -4769,70 +4966,92 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c
uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance128x128 aom_sub_pixel_avg_variance128x128_c
+uint32_t aom_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance128x128 aom_sub_pixel_avg_variance128x128_neon
uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance128x64 aom_sub_pixel_avg_variance128x64_c
+uint32_t aom_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance128x64 aom_sub_pixel_avg_variance128x64_neon
uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x16 aom_sub_pixel_avg_variance16x16_c
+uint32_t aom_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x16 aom_sub_pixel_avg_variance16x16_neon
uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x32 aom_sub_pixel_avg_variance16x32_c
+uint32_t aom_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x32 aom_sub_pixel_avg_variance16x32_neon
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x4 aom_sub_pixel_avg_variance16x4_c
+uint32_t aom_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x4 aom_sub_pixel_avg_variance16x4_neon
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x64 aom_sub_pixel_avg_variance16x64_c
+uint32_t aom_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x64 aom_sub_pixel_avg_variance16x64_neon
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x8 aom_sub_pixel_avg_variance16x8_c
+uint32_t aom_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x8 aom_sub_pixel_avg_variance16x8_neon
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x16 aom_sub_pixel_avg_variance32x16_c
+uint32_t aom_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x16 aom_sub_pixel_avg_variance32x16_neon
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x32 aom_sub_pixel_avg_variance32x32_c
+uint32_t aom_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x32 aom_sub_pixel_avg_variance32x32_neon
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x64 aom_sub_pixel_avg_variance32x64_c
+uint32_t aom_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x64 aom_sub_pixel_avg_variance32x64_neon
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x8 aom_sub_pixel_avg_variance32x8_c
+uint32_t aom_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x8 aom_sub_pixel_avg_variance32x8_neon
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance4x16 aom_sub_pixel_avg_variance4x16_c
+uint32_t aom_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance4x16 aom_sub_pixel_avg_variance4x16_neon
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance4x4 aom_sub_pixel_avg_variance4x4_c
+uint32_t aom_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance4x4 aom_sub_pixel_avg_variance4x4_neon
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance4x8 aom_sub_pixel_avg_variance4x8_c
+uint32_t aom_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance4x8 aom_sub_pixel_avg_variance4x8_neon
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x128 aom_sub_pixel_avg_variance64x128_c
+uint32_t aom_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x128 aom_sub_pixel_avg_variance64x128_neon
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x16 aom_sub_pixel_avg_variance64x16_c
+uint32_t aom_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x16 aom_sub_pixel_avg_variance64x16_neon
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x32 aom_sub_pixel_avg_variance64x32_c
+uint32_t aom_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x32 aom_sub_pixel_avg_variance64x32_neon
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x64 aom_sub_pixel_avg_variance64x64_c
+uint32_t aom_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x64 aom_sub_pixel_avg_variance64x64_neon
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x16 aom_sub_pixel_avg_variance8x16_c
+uint32_t aom_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x16 aom_sub_pixel_avg_variance8x16_neon
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x32 aom_sub_pixel_avg_variance8x32_c
+uint32_t aom_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x32 aom_sub_pixel_avg_variance8x32_neon
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x4 aom_sub_pixel_avg_variance8x4_c
+uint32_t aom_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x4 aom_sub_pixel_avg_variance8x4_neon
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x8 aom_sub_pixel_avg_variance8x8_c
+uint32_t aom_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x8 aom_sub_pixel_avg_variance8x8_neon
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -4934,7 +5153,8 @@ uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t N);
#define aom_sum_squares_i16 aom_sum_squares_i16_c
uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width, int height, int *sum);
-#define aom_sum_sse_2d_i16 aom_sum_sse_2d_i16_c
+uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int src_stride, int width, int height, int *sum);
+#define aom_sum_sse_2d_i16 aom_sum_sse_2d_i16_neon
void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -5020,10 +5240,12 @@ unsigned int aom_variance16x32_neon(const uint8_t *src_ptr, int source_stride, c
#define aom_variance16x32 aom_variance16x32_neon
unsigned int aom_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance16x4 aom_variance16x4_c
+unsigned int aom_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance16x4 aom_variance16x4_neon
unsigned int aom_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance16x64 aom_variance16x64_c
+unsigned int aom_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance16x64 aom_variance16x64_neon
unsigned int aom_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -5048,10 +5270,12 @@ unsigned int aom_variance32x64_neon(const uint8_t *src_ptr, int source_stride, c
#define aom_variance32x64 aom_variance32x64_neon
unsigned int aom_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance32x8 aom_variance32x8_c
+unsigned int aom_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance32x8 aom_variance32x8_neon
unsigned int aom_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance4x16 aom_variance4x16_c
+unsigned int aom_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance4x16 aom_variance4x16_neon
unsigned int aom_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x2 aom_variance4x2_c
@@ -5069,7 +5293,8 @@ unsigned int aom_variance64x128_neon(const uint8_t *src_ptr, int source_stride,
#define aom_variance64x128 aom_variance64x128_neon
unsigned int aom_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance64x16 aom_variance64x16_c
+unsigned int aom_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance64x16 aom_variance64x16_neon
unsigned int aom_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -5084,7 +5309,8 @@ unsigned int aom_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
#define aom_variance8x16 aom_variance8x16_neon
unsigned int aom_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance8x32 aom_variance8x32_c
+unsigned int aom_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance8x32 aom_variance8x32_neon
unsigned int aom_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -5094,10 +5320,13 @@ unsigned int aom_variance8x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance8x8 aom_variance8x8_neon
-int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
-int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl);
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl);
#define aom_vector_var aom_vector_var_neon
+double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
+#define av1_compute_cross_correlation av1_compute_cross_correlation_c
+
void aom_dsp_rtcd(void);
#include "config/aom_config.h"
diff --git a/config/arm/config/aom_scale_rtcd.h b/config/arm/config/aom_scale_rtcd.h
index 067ddb492..df4b96f2a 100644
--- a/config/arm/config/aom_scale_rtcd.h
+++ b/config/arm/config/aom_scale_rtcd.h
@@ -17,6 +17,9 @@ extern "C" {
void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
+void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
+#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
+
void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
diff --git a/config/arm/config/av1_rtcd.h b/config/arm/config/av1_rtcd.h
index 340cbd180..964bb72a7 100644
--- a/config/arm/config/av1_rtcd.h
+++ b/config/arm/config/av1_rtcd.h
@@ -125,7 +125,8 @@ void av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int he
#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_neon
void av1_apply_temporal_filter_c(const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count);
-#define av1_apply_temporal_filter av1_apply_temporal_filter_c
+void av1_apply_temporal_filter_neon(const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+#define av1_apply_temporal_filter av1_apply_temporal_filter_neon
int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
@@ -148,10 +149,10 @@ void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE m
int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
#define av1_calc_frame_error av1_calc_frame_error_c
-void av1_calc_indices_dim1_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
+void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim1 av1_calc_indices_dim1_c
-void av1_calc_indices_dim2_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
+void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim2 av1_calc_indices_dim2_c
void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
@@ -178,9 +179,6 @@ void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int
bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
-double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
#define av1_compute_stats av1_compute_stats_c
@@ -636,18 +634,19 @@ int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int
#define av1_wedge_sign_from_residuals av1_wedge_sign_from_residuals_c
uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
-#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_c
+uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
+#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_neon
void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_neon
-void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_neon
-void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_neon
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
diff --git a/config/arm64/config/aom_config.asm b/config/arm64/config/aom_config.asm
index 5a3bb6ad2..0ea9a460c 100644
--- a/config/arm64/config/aom_config.asm
+++ b/config/arm64/config/aom_config.asm
@@ -1,5 +1,5 @@
;
-; Copyright (c) 2022, Alliance for Open Media. All rights reserved
+; Copyright (c) 2023, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,7 +9,6 @@
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
ARCH_ARM equ 1
-ARCH_MIPS equ 0
ARCH_PPC equ 0
ARCH_X86 equ 0
ARCH_X86_64 equ 0
@@ -67,14 +66,11 @@ CONFIG_WEBM_IO equ 1
DECODE_HEIGHT_LIMIT equ 16384
DECODE_WIDTH_LIMIT equ 16384
FORCE_HIGHBITDEPTH_DECODING equ 0
+HAVE_ARM_CRC32 equ 0
HAVE_AVX equ 0
HAVE_AVX2 equ 0
-HAVE_DSPR2 equ 0
HAVE_FEXCEPT equ 1
-HAVE_MIPS32 equ 0
-HAVE_MIPS64 equ 0
HAVE_MMX equ 0
-HAVE_MSA equ 0
HAVE_NEON equ 1
HAVE_PTHREAD_H equ 1
HAVE_SSE equ 0
diff --git a/config/arm64/config/aom_config.h b/config/arm64/config/aom_config.h
index f59508241..616a56fb6 100644
--- a/config/arm64/config/aom_config.h
+++ b/config/arm64/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,7 +11,6 @@
#ifndef AOM_CONFIG_H_
#define AOM_CONFIG_H_
#define ARCH_ARM 1
-#define ARCH_MIPS 0
#define ARCH_PPC 0
#define ARCH_X86 0
#define ARCH_X86_64 0
@@ -69,14 +68,11 @@
#define DECODE_HEIGHT_LIMIT 16384
#define DECODE_WIDTH_LIMIT 16384
#define FORCE_HIGHBITDEPTH_DECODING 0
+#define HAVE_ARM_CRC32 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
-#define HAVE_DSPR2 0
#define HAVE_FEXCEPT 1
-#define HAVE_MIPS32 0
-#define HAVE_MIPS64 0
#define HAVE_MMX 0
-#define HAVE_MSA 0
#define HAVE_NEON 1
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 0
diff --git a/config/arm64/config/aom_dsp_rtcd.h b/config/arm64/config/aom_dsp_rtcd.h
index 9348bfe4b..7ae663693 100644
--- a/config/arm64/config/aom_dsp_rtcd.h
+++ b/config/arm64/config/aom_dsp_rtcd.h
@@ -488,9 +488,12 @@ void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, int
unsigned int aom_get_mb_ss_c(const int16_t *);
#define aom_get_mb_ss aom_get_mb_ss_c
-void aom_get_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void aom_get_sse_sum_8x8_quad_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define aom_get_sse_sum_8x8_quad aom_get_sse_sum_8x8_quad_neon
+void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
+#define aom_get_var_sse_sum_16x16_dual aom_get_var_sse_sum_16x16_dual_c
+
+void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+#define aom_get_var_sse_sum_8x8_quad aom_get_var_sse_sum_8x8_quad_neon
void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -2778,6 +2781,9 @@ unsigned int aom_highbd_sad128x128_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
+
void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_c
@@ -2787,6 +2793,9 @@ unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
+
void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_c
@@ -2796,6 +2805,9 @@ unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_c
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
+
void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_c
@@ -2805,6 +2817,9 @@ unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_c
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
+
void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_c
@@ -2814,6 +2829,9 @@ unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_c
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
+
void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_c
@@ -2823,6 +2841,9 @@ unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_c
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
+
void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_c
@@ -2832,6 +2853,9 @@ unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_c
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
+
void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_c
@@ -2841,6 +2865,9 @@ unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_c
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
+
void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_c
@@ -2850,6 +2877,9 @@ unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_c
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
+
void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_c
@@ -2859,6 +2889,9 @@ unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_c
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
+
void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_c
@@ -2868,6 +2901,9 @@ unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_c
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
+
void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_c
@@ -2877,6 +2913,9 @@ unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_c
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
+
void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_c
@@ -2886,6 +2925,9 @@ unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_c
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
+
void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_c
@@ -2895,6 +2937,9 @@ unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_c
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
+
void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_c
@@ -2904,6 +2949,9 @@ unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
+
void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_c
@@ -2913,6 +2961,9 @@ unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_c
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
+
void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_c
@@ -2922,6 +2973,9 @@ unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_c
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
+
void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_c
@@ -2931,6 +2985,9 @@ unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_c
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
+
void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_c
@@ -2940,6 +2997,9 @@ unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_c
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
+
void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_c
@@ -2949,6 +3009,9 @@ unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_c
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
+
void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_c
@@ -2958,6 +3021,9 @@ unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_c
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
+
void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_c
@@ -2967,6 +3033,9 @@ unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_c
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
+
void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_c
@@ -3432,12 +3501,12 @@ void aom_ifft4x4_float_c(const float *input, float *temp, float *output);
void aom_ifft8x8_float_c(const float *input, float *temp, float *output);
#define aom_ifft8x8_float aom_ifft8x8_float_c
-int16_t aom_int_pro_col_c(const uint8_t *ref, const int width);
-int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width);
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_col aom_int_pro_col_neon
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
-void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_row aom_int_pro_row_neon
void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
@@ -3746,16 +3815,23 @@ unsigned int aom_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const
#define aom_mse16x16 aom_mse16x16_neon
unsigned int aom_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_mse16x8 aom_mse16x8_c
+unsigned int aom_mse16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define aom_mse16x8 aom_mse16x8_neon
unsigned int aom_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_mse8x16 aom_mse8x16_c
+unsigned int aom_mse8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define aom_mse8x16 aom_mse8x16_neon
unsigned int aom_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_mse8x8 aom_mse8x8_c
+unsigned int aom_mse8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define aom_mse8x8 aom_mse8x8_neon
+
+uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
+#define aom_mse_16xh_16bit aom_mse_16xh_16bit_c
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
-#define aom_mse_wxh_16bit aom_mse_wxh_16bit_c
+uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
+#define aom_mse_wxh_16bit aom_mse_wxh_16bit_neon
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
#define aom_mse_wxh_16bit_highbd aom_mse_wxh_16bit_highbd_c
@@ -4063,22 +4139,33 @@ unsigned int aom_sad128x128_neon(const uint8_t *src_ptr, int src_stride, const u
#define aom_sad128x128 aom_sad128x128_neon
unsigned int aom_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad128x128_avg aom_sad128x128_avg_c
+unsigned int aom_sad128x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad128x128_avg aom_sad128x128_avg_neon
+
+void aom_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x128x3d aom_sad128x128x3d_c
void aom_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad128x128x4d aom_sad128x128x4d_c
+void aom_sad128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x128x4d aom_sad128x128x4d_neon
void aom_sad128x128x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad128x128x4d_avg aom_sad128x128x4d_avg_c
unsigned int aom_sad128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad128x64 aom_sad128x64_c
+unsigned int aom_sad128x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad128x64 aom_sad128x64_neon
unsigned int aom_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad128x64_avg aom_sad128x64_avg_c
+unsigned int aom_sad128x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad128x64_avg aom_sad128x64_avg_neon
+
+void aom_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x64x3d aom_sad128x64x3d_c
void aom_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad128x64x4d aom_sad128x64x4d_c
+void aom_sad128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x64x4d aom_sad128x64x4d_neon
void aom_sad128x64x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad128x64x4d_avg aom_sad128x64x4d_avg_c
@@ -4091,7 +4178,11 @@ unsigned int aom_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uin
#define aom_sad16x16 aom_sad16x16_neon
unsigned int aom_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x16_avg aom_sad16x16_avg_c
+unsigned int aom_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x16_avg aom_sad16x16_avg_neon
+
+void aom_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x16x3d aom_sad16x16x3d_c
void aom_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -4101,37 +4192,55 @@ void aom_sad16x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t
#define aom_sad16x16x4d_avg aom_sad16x16x4d_avg_c
unsigned int aom_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad16x32 aom_sad16x32_c
+unsigned int aom_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad16x32 aom_sad16x32_neon
unsigned int aom_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x32_avg aom_sad16x32_avg_c
+unsigned int aom_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x32_avg aom_sad16x32_avg_neon
+
+void aom_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x32x3d aom_sad16x32x3d_c
void aom_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x32x4d aom_sad16x32x4d_c
+void aom_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x32x4d aom_sad16x32x4d_neon
void aom_sad16x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x32x4d_avg aom_sad16x32x4d_avg_c
unsigned int aom_sad16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad16x4 aom_sad16x4_c
+unsigned int aom_sad16x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad16x4 aom_sad16x4_neon
unsigned int aom_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x4_avg aom_sad16x4_avg_c
+unsigned int aom_sad16x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x4_avg aom_sad16x4_avg_neon
+
+void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x4x3d aom_sad16x4x3d_c
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x4d aom_sad16x4x4d_c
+void aom_sad16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x4x4d aom_sad16x4x4d_neon
void aom_sad16x4x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x4x4d_avg aom_sad16x4x4d_avg_c
unsigned int aom_sad16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad16x64 aom_sad16x64_c
+unsigned int aom_sad16x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad16x64 aom_sad16x64_neon
unsigned int aom_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x64_avg aom_sad16x64_avg_c
+unsigned int aom_sad16x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x64_avg aom_sad16x64_avg_neon
+
+void aom_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x64x3d aom_sad16x64x3d_c
void aom_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x64x4d aom_sad16x64x4d_c
+void aom_sad16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x64x4d aom_sad16x64x4d_neon
void aom_sad16x64x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x64x4d_avg aom_sad16x64x4d_avg_c
@@ -4141,10 +4250,15 @@ unsigned int aom_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint
#define aom_sad16x8 aom_sad16x8_neon
unsigned int aom_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad16x8_avg aom_sad16x8_avg_c
+unsigned int aom_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad16x8_avg aom_sad16x8_avg_neon
+
+void aom_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x8x3d aom_sad16x8x3d_c
void aom_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x8x4d aom_sad16x8x4d_c
+void aom_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x8x4d aom_sad16x8x4d_neon
void aom_sad16x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad16x8x4d_avg aom_sad16x8x4d_avg_c
@@ -4153,13 +4267,19 @@ unsigned int aom_sad16xh_c(const uint8_t *a, int a_stride, const uint8_t *b, int
#define aom_sad16xh aom_sad16xh_c
unsigned int aom_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad32x16 aom_sad32x16_c
+unsigned int aom_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad32x16 aom_sad32x16_neon
unsigned int aom_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x16_avg aom_sad32x16_avg_c
+unsigned int aom_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x16_avg aom_sad32x16_avg_neon
+
+void aom_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x16x3d aom_sad32x16x3d_c
void aom_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad32x16x4d aom_sad32x16x4d_c
+void aom_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x16x4d aom_sad32x16x4d_neon
void aom_sad32x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad32x16x4d_avg aom_sad32x16x4d_avg_c
@@ -4169,7 +4289,11 @@ unsigned int aom_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uin
#define aom_sad32x32 aom_sad32x32_neon
unsigned int aom_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x32_avg aom_sad32x32_avg_c
+unsigned int aom_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x32_avg aom_sad32x32_avg_neon
+
+void aom_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x32x3d aom_sad32x32x3d_c
void aom_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -4179,25 +4303,37 @@ void aom_sad32x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t
#define aom_sad32x32x4d_avg aom_sad32x32x4d_avg_c
unsigned int aom_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad32x64 aom_sad32x64_c
+unsigned int aom_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad32x64 aom_sad32x64_neon
unsigned int aom_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x64_avg aom_sad32x64_avg_c
+unsigned int aom_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x64_avg aom_sad32x64_avg_neon
+
+void aom_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x64x3d aom_sad32x64x3d_c
void aom_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad32x64x4d aom_sad32x64x4d_c
+void aom_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x64x4d aom_sad32x64x4d_neon
void aom_sad32x64x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad32x64x4d_avg aom_sad32x64x4d_avg_c
unsigned int aom_sad32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad32x8 aom_sad32x8_c
+unsigned int aom_sad32x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad32x8 aom_sad32x8_neon
unsigned int aom_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad32x8_avg aom_sad32x8_avg_c
+unsigned int aom_sad32x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad32x8_avg aom_sad32x8_avg_neon
+
+void aom_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x8x3d aom_sad32x8x3d_c
void aom_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad32x8x4d aom_sad32x8x4d_c
+void aom_sad32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x8x4d aom_sad32x8x4d_neon
void aom_sad32x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad32x8x4d_avg aom_sad32x8x4d_avg_c
@@ -4206,13 +4342,19 @@ unsigned int aom_sad32xh_c(const uint8_t *a, int a_stride, const uint8_t *b, int
#define aom_sad32xh aom_sad32xh_c
unsigned int aom_sad4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad4x16 aom_sad4x16_c
+unsigned int aom_sad4x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad4x16 aom_sad4x16_neon
unsigned int aom_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad4x16_avg aom_sad4x16_avg_c
+unsigned int aom_sad4x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad4x16_avg aom_sad4x16_avg_neon
+
+void aom_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x16x3d aom_sad4x16x3d_c
void aom_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad4x16x4d aom_sad4x16x4d_c
+void aom_sad4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x16x4d aom_sad4x16x4d_neon
void aom_sad4x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad4x16x4d_avg aom_sad4x16x4d_avg_c
@@ -4222,22 +4364,33 @@ unsigned int aom_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8
#define aom_sad4x4 aom_sad4x4_neon
unsigned int aom_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad4x4_avg aom_sad4x4_avg_c
+unsigned int aom_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad4x4_avg aom_sad4x4_avg_neon
+
+void aom_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x4x3d aom_sad4x4x3d_c
void aom_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad4x4x4d aom_sad4x4x4d_c
+void aom_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x4x4d aom_sad4x4x4d_neon
void aom_sad4x4x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad4x4x4d_avg aom_sad4x4x4d_avg_c
unsigned int aom_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad4x8 aom_sad4x8_c
+unsigned int aom_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad4x8 aom_sad4x8_neon
unsigned int aom_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad4x8_avg aom_sad4x8_avg_c
+unsigned int aom_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad4x8_avg aom_sad4x8_avg_neon
+
+void aom_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x8x3d aom_sad4x8x3d_c
void aom_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad4x8x4d aom_sad4x8x4d_c
+void aom_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x8x4d aom_sad4x8x4d_neon
void aom_sad4x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad4x8x4d_avg aom_sad4x8x4d_avg_c
@@ -4246,37 +4399,55 @@ unsigned int aom_sad4xh_c(const uint8_t *a, int a_stride, const uint8_t *b, int
#define aom_sad4xh aom_sad4xh_c
unsigned int aom_sad64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad64x128 aom_sad64x128_c
+unsigned int aom_sad64x128_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad64x128 aom_sad64x128_neon
unsigned int aom_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x128_avg aom_sad64x128_avg_c
+unsigned int aom_sad64x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x128_avg aom_sad64x128_avg_neon
+
+void aom_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x128x3d aom_sad64x128x3d_c
void aom_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad64x128x4d aom_sad64x128x4d_c
+void aom_sad64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x128x4d aom_sad64x128x4d_neon
void aom_sad64x128x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad64x128x4d_avg aom_sad64x128x4d_avg_c
unsigned int aom_sad64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad64x16 aom_sad64x16_c
+unsigned int aom_sad64x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad64x16 aom_sad64x16_neon
unsigned int aom_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x16_avg aom_sad64x16_avg_c
+unsigned int aom_sad64x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x16_avg aom_sad64x16_avg_neon
+
+void aom_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x16x3d aom_sad64x16x3d_c
void aom_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad64x16x4d aom_sad64x16x4d_c
+void aom_sad64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x16x4d aom_sad64x16x4d_neon
void aom_sad64x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad64x16x4d_avg aom_sad64x16x4d_avg_c
unsigned int aom_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad64x32 aom_sad64x32_c
+unsigned int aom_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad64x32 aom_sad64x32_neon
unsigned int aom_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x32_avg aom_sad64x32_avg_c
+unsigned int aom_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x32_avg aom_sad64x32_avg_neon
+
+void aom_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x32x3d aom_sad64x32x3d_c
void aom_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad64x32x4d aom_sad64x32x4d_c
+void aom_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x32x4d aom_sad64x32x4d_neon
void aom_sad64x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad64x32x4d_avg aom_sad64x32x4d_avg_c
@@ -4286,7 +4457,11 @@ unsigned int aom_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uin
#define aom_sad64x64 aom_sad64x64_neon
unsigned int aom_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad64x64_avg aom_sad64x64_avg_c
+unsigned int aom_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad64x64_avg aom_sad64x64_avg_neon
+
+void aom_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x64x3d aom_sad64x64x3d_c
void aom_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -4303,34 +4478,51 @@ unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint
#define aom_sad8x16 aom_sad8x16_neon
unsigned int aom_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x16_avg aom_sad8x16_avg_c
+unsigned int aom_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x16_avg aom_sad8x16_avg_neon
+
+void aom_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x16x3d aom_sad8x16x3d_c
void aom_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x16x4d aom_sad8x16x4d_c
+void aom_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x16x4d aom_sad8x16x4d_neon
void aom_sad8x16x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x16x4d_avg aom_sad8x16x4d_avg_c
unsigned int aom_sad8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad8x32 aom_sad8x32_c
+unsigned int aom_sad8x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad8x32 aom_sad8x32_neon
unsigned int aom_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x32_avg aom_sad8x32_avg_c
+unsigned int aom_sad8x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x32_avg aom_sad8x32_avg_neon
+
+void aom_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x32x3d aom_sad8x32x3d_c
void aom_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x32x4d aom_sad8x32x4d_c
+void aom_sad8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x32x4d aom_sad8x32x4d_neon
void aom_sad8x32x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x32x4d_avg aom_sad8x32x4d_avg_c
unsigned int aom_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define aom_sad8x4 aom_sad8x4_c
+unsigned int aom_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define aom_sad8x4 aom_sad8x4_neon
unsigned int aom_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x4_avg aom_sad8x4_avg_c
+unsigned int aom_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x4_avg aom_sad8x4_avg_neon
+
+void aom_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x4x3d aom_sad8x4x3d_c
void aom_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x4x4d aom_sad8x4x4d_c
+void aom_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x4x4d aom_sad8x4x4d_neon
void aom_sad8x4x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x4x4d_avg aom_sad8x4x4d_avg_c
@@ -4340,10 +4532,15 @@ unsigned int aom_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8
#define aom_sad8x8 aom_sad8x8_neon
unsigned int aom_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_sad8x8_avg aom_sad8x8_avg_c
+unsigned int aom_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_sad8x8_avg aom_sad8x8_avg_neon
+
+void aom_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x8x3d aom_sad8x8x3d_c
void aom_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad8x8x4d aom_sad8x8x4d_c
+void aom_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x8x4d aom_sad8x8x4d_neon
void aom_sad8x8x4d_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]);
#define aom_sad8x8x4d_avg aom_sad8x8x4d_avg_c
@@ -4769,70 +4966,92 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c
uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance128x128 aom_sub_pixel_avg_variance128x128_c
+uint32_t aom_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance128x128 aom_sub_pixel_avg_variance128x128_neon
uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance128x64 aom_sub_pixel_avg_variance128x64_c
+uint32_t aom_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance128x64 aom_sub_pixel_avg_variance128x64_neon
uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x16 aom_sub_pixel_avg_variance16x16_c
+uint32_t aom_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x16 aom_sub_pixel_avg_variance16x16_neon
uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x32 aom_sub_pixel_avg_variance16x32_c
+uint32_t aom_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x32 aom_sub_pixel_avg_variance16x32_neon
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x4 aom_sub_pixel_avg_variance16x4_c
+uint32_t aom_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x4 aom_sub_pixel_avg_variance16x4_neon
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x64 aom_sub_pixel_avg_variance16x64_c
+uint32_t aom_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x64 aom_sub_pixel_avg_variance16x64_neon
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance16x8 aom_sub_pixel_avg_variance16x8_c
+uint32_t aom_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance16x8 aom_sub_pixel_avg_variance16x8_neon
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x16 aom_sub_pixel_avg_variance32x16_c
+uint32_t aom_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x16 aom_sub_pixel_avg_variance32x16_neon
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x32 aom_sub_pixel_avg_variance32x32_c
+uint32_t aom_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x32 aom_sub_pixel_avg_variance32x32_neon
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x64 aom_sub_pixel_avg_variance32x64_c
+uint32_t aom_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x64 aom_sub_pixel_avg_variance32x64_neon
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance32x8 aom_sub_pixel_avg_variance32x8_c
+uint32_t aom_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance32x8 aom_sub_pixel_avg_variance32x8_neon
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance4x16 aom_sub_pixel_avg_variance4x16_c
+uint32_t aom_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance4x16 aom_sub_pixel_avg_variance4x16_neon
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance4x4 aom_sub_pixel_avg_variance4x4_c
+uint32_t aom_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance4x4 aom_sub_pixel_avg_variance4x4_neon
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance4x8 aom_sub_pixel_avg_variance4x8_c
+uint32_t aom_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance4x8 aom_sub_pixel_avg_variance4x8_neon
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x128 aom_sub_pixel_avg_variance64x128_c
+uint32_t aom_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x128 aom_sub_pixel_avg_variance64x128_neon
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x16 aom_sub_pixel_avg_variance64x16_c
+uint32_t aom_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x16 aom_sub_pixel_avg_variance64x16_neon
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x32 aom_sub_pixel_avg_variance64x32_c
+uint32_t aom_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x32 aom_sub_pixel_avg_variance64x32_neon
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance64x64 aom_sub_pixel_avg_variance64x64_c
+uint32_t aom_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance64x64 aom_sub_pixel_avg_variance64x64_neon
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x16 aom_sub_pixel_avg_variance8x16_c
+uint32_t aom_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x16 aom_sub_pixel_avg_variance8x16_neon
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x32 aom_sub_pixel_avg_variance8x32_c
+uint32_t aom_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x32 aom_sub_pixel_avg_variance8x32_neon
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x4 aom_sub_pixel_avg_variance8x4_c
+uint32_t aom_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x4 aom_sub_pixel_avg_variance8x4_neon
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_sub_pixel_avg_variance8x8 aom_sub_pixel_avg_variance8x8_c
+uint32_t aom_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_sub_pixel_avg_variance8x8 aom_sub_pixel_avg_variance8x8_neon
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -4934,7 +5153,8 @@ uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t N);
#define aom_sum_squares_i16 aom_sum_squares_i16_c
uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width, int height, int *sum);
-#define aom_sum_sse_2d_i16 aom_sum_sse_2d_i16_c
+uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int src_stride, int width, int height, int *sum);
+#define aom_sum_sse_2d_i16 aom_sum_sse_2d_i16_neon
void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -5020,10 +5240,12 @@ unsigned int aom_variance16x32_neon(const uint8_t *src_ptr, int source_stride, c
#define aom_variance16x32 aom_variance16x32_neon
unsigned int aom_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance16x4 aom_variance16x4_c
+unsigned int aom_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance16x4 aom_variance16x4_neon
unsigned int aom_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance16x64 aom_variance16x64_c
+unsigned int aom_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance16x64 aom_variance16x64_neon
unsigned int aom_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -5048,10 +5270,12 @@ unsigned int aom_variance32x64_neon(const uint8_t *src_ptr, int source_stride, c
#define aom_variance32x64 aom_variance32x64_neon
unsigned int aom_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance32x8 aom_variance32x8_c
+unsigned int aom_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance32x8 aom_variance32x8_neon
unsigned int aom_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance4x16 aom_variance4x16_c
+unsigned int aom_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance4x16 aom_variance4x16_neon
unsigned int aom_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x2 aom_variance4x2_c
@@ -5069,7 +5293,8 @@ unsigned int aom_variance64x128_neon(const uint8_t *src_ptr, int source_stride,
#define aom_variance64x128 aom_variance64x128_neon
unsigned int aom_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance64x16 aom_variance64x16_c
+unsigned int aom_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance64x16 aom_variance64x16_neon
unsigned int aom_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -5084,7 +5309,8 @@ unsigned int aom_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
#define aom_variance8x16 aom_variance8x16_neon
unsigned int aom_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance8x32 aom_variance8x32_c
+unsigned int aom_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define aom_variance8x32 aom_variance8x32_neon
unsigned int aom_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -5094,10 +5320,13 @@ unsigned int aom_variance8x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance8x8 aom_variance8x8_neon
-int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
-int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl);
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl);
#define aom_vector_var aom_vector_var_neon
+double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
+#define av1_compute_cross_correlation av1_compute_cross_correlation_c
+
void aom_dsp_rtcd(void);
#include "config/aom_config.h"
diff --git a/config/arm64/config/aom_scale_rtcd.h b/config/arm64/config/aom_scale_rtcd.h
index 067ddb492..df4b96f2a 100644
--- a/config/arm64/config/aom_scale_rtcd.h
+++ b/config/arm64/config/aom_scale_rtcd.h
@@ -17,6 +17,9 @@ extern "C" {
void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
+void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
+#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
+
void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
diff --git a/config/arm64/config/av1_rtcd.h b/config/arm64/config/av1_rtcd.h
index 340cbd180..964bb72a7 100644
--- a/config/arm64/config/av1_rtcd.h
+++ b/config/arm64/config/av1_rtcd.h
@@ -125,7 +125,8 @@ void av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int he
#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_neon
void av1_apply_temporal_filter_c(const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count);
-#define av1_apply_temporal_filter av1_apply_temporal_filter_c
+void av1_apply_temporal_filter_neon(const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+#define av1_apply_temporal_filter av1_apply_temporal_filter_neon
int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
@@ -148,10 +149,10 @@ void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE m
int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
#define av1_calc_frame_error av1_calc_frame_error_c
-void av1_calc_indices_dim1_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
+void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim1 av1_calc_indices_dim1_c
-void av1_calc_indices_dim2_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
+void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim2 av1_calc_indices_dim2_c
void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
@@ -178,9 +179,6 @@ void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int
bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
-double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
#define av1_compute_stats av1_compute_stats_c
@@ -636,18 +634,19 @@ int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int
#define av1_wedge_sign_from_residuals av1_wedge_sign_from_residuals_c
uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
-#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_c
+uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
+#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_neon
void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_neon
-void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_neon
-void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_neon
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
diff --git a/config/config/aom_version.h b/config/config/aom_version.h
index ac31515e4..887ac9682 100644
--- a/config/config/aom_version.h
+++ b/config/config/aom_version.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -10,10 +10,10 @@
*/
#define VERSION_MAJOR 3
-#define VERSION_MINOR 5
+#define VERSION_MINOR 6
#define VERSION_PATCH 0
-#define VERSION_EXTRA ""
+#define VERSION_EXTRA "210-g1129f499c7"
#define VERSION_PACKED \
((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))
-#define VERSION_STRING_NOSP "v3.5.0"
-#define VERSION_STRING " v3.5.0"
+#define VERSION_STRING_NOSP "3.6.0-210-g1129f499c7"
+#define VERSION_STRING " 3.6.0-210-g1129f499c7"
diff --git a/config/x86/config/aom_config.asm b/config/x86/config/aom_config.asm
index ea7a0b5f7..987b2781b 100644
--- a/config/x86/config/aom_config.asm
+++ b/config/x86/config/aom_config.asm
@@ -1,5 +1,4 @@
%define ARCH_ARM 0
-%define ARCH_MIPS 0
%define ARCH_PPC 0
%define ARCH_X86 1
%define ARCH_X86_64 0
@@ -57,14 +56,11 @@
%define DECODE_HEIGHT_LIMIT 16384
%define DECODE_WIDTH_LIMIT 16384
%define FORCE_HIGHBITDEPTH_DECODING 0
+%define HAVE_ARM_CRC32 0
%define HAVE_AVX 0
%define HAVE_AVX2 0
-%define HAVE_DSPR2 0
%define HAVE_FEXCEPT 1
-%define HAVE_MIPS32 0
-%define HAVE_MIPS64 0
%define HAVE_MMX 1
-%define HAVE_MSA 0
%define HAVE_NEON 0
%define HAVE_PTHREAD_H 1
%define HAVE_SSE 1
diff --git a/config/x86/config/aom_config.h b/config/x86/config/aom_config.h
index f553b5a35..d5f6176e2 100644
--- a/config/x86/config/aom_config.h
+++ b/config/x86/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,7 +11,6 @@
#ifndef AOM_CONFIG_H_
#define AOM_CONFIG_H_
#define ARCH_ARM 0
-#define ARCH_MIPS 0
#define ARCH_PPC 0
#define ARCH_X86 1
#define ARCH_X86_64 0
@@ -69,14 +68,11 @@
#define DECODE_HEIGHT_LIMIT 16384
#define DECODE_WIDTH_LIMIT 16384
#define FORCE_HIGHBITDEPTH_DECODING 0
+#define HAVE_ARM_CRC32 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
-#define HAVE_DSPR2 0
#define HAVE_FEXCEPT 1
-#define HAVE_MIPS32 0
-#define HAVE_MIPS64 0
#define HAVE_MMX 1
-#define HAVE_MSA 0
#define HAVE_NEON 0
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 1
diff --git a/config/x86/config/aom_dsp_rtcd.h b/config/x86/config/aom_dsp_rtcd.h
index efdcb41d8..a259b8fbd 100644
--- a/config/x86/config/aom_dsp_rtcd.h
+++ b/config/x86/config/aom_dsp_rtcd.h
@@ -600,9 +600,12 @@ unsigned int aom_get_mb_ss_c(const int16_t *);
unsigned int aom_get_mb_ss_sse2(const int16_t *);
#define aom_get_mb_ss aom_get_mb_ss_sse2
-void aom_get_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void aom_get_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define aom_get_sse_sum_8x8_quad aom_get_sse_sum_8x8_quad_sse2
+void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
+#define aom_get_var_sse_sum_16x16_dual aom_get_var_sse_sum_16x16_dual_c
+
+void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+#define aom_get_var_sse_sum_8x8_quad aom_get_var_sse_sum_8x8_quad_sse2
void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -3166,6 +3169,9 @@ unsigned int aom_highbd_sad128x128_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
+
void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_c
@@ -3175,6 +3181,9 @@ unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
+
void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_c
@@ -3186,6 +3195,9 @@ unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_sse2
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
+
void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_sse2
@@ -3198,6 +3210,9 @@ unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_sse2
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
+
void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_sse2
@@ -3210,6 +3225,9 @@ unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_sse2
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
+
void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_sse2
@@ -3222,6 +3240,9 @@ unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_sse2
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
+
void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_sse2
@@ -3234,6 +3255,9 @@ unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_sse2
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
+
void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_sse2
@@ -3246,6 +3270,9 @@ unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_sse2
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
+
void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_sse2
@@ -3258,6 +3285,9 @@ unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_sse2
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
+
void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_sse2
@@ -3270,6 +3300,9 @@ unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_sse2
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
+
void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_sse2
@@ -3282,6 +3315,9 @@ unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad32x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_sse2
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
+
void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_sse2
@@ -3294,6 +3330,9 @@ unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad4x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_sse2
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
+
void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_sse2
@@ -3306,6 +3345,9 @@ unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_sse2
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
+
void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_sse2
@@ -3318,6 +3360,9 @@ unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_sse2
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
+
void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_sse2
@@ -3328,6 +3373,9 @@ unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
+
void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_c
@@ -3339,6 +3387,9 @@ unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_sse2
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
+
void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_sse2
@@ -3351,6 +3402,9 @@ unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_sse2
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
+
void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_sse2
@@ -3363,6 +3417,9 @@ unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_sse2
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
+
void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_sse2
@@ -3375,6 +3432,9 @@ unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_sse2
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
+
void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_sse2
@@ -3387,6 +3447,9 @@ unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_sse2
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
+
void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_sse2
@@ -3399,6 +3462,9 @@ unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_sse2
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
+
void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_sse2
@@ -3411,6 +3477,9 @@ unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_sse2
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
+
void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_sse2
@@ -3846,12 +3915,12 @@ void aom_ifft8x8_float_c(const float *input, float *temp, float *output);
void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output);
#define aom_ifft8x8_float aom_ifft8x8_float_sse2
-int16_t aom_int_pro_col_c(const uint8_t *ref, const int width);
-int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width);
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_col aom_int_pro_col_sse2
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
-void aom_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_row aom_int_pro_row_sse2
void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
@@ -4237,6 +4306,10 @@ unsigned int aom_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
unsigned int aom_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_mse8x8 aom_mse8x8_sse2
+uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
+uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
+#define aom_mse_16xh_16bit aom_mse_16xh_16bit_sse2
+
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
#define aom_mse_wxh_16bit aom_mse_wxh_16bit_sse2
@@ -4554,6 +4627,9 @@ unsigned int aom_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_sad128x128_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad128x128_avg aom_sad128x128_avg_sse2
+void aom_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x128x3d aom_sad128x128x3d_c
+
void aom_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad128x128x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad128x128x4d aom_sad128x128x4d_sse2
@@ -4570,6 +4646,9 @@ unsigned int aom_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_sad128x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad128x64_avg aom_sad128x64_avg_sse2
+void aom_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x64x3d aom_sad128x64x3d_c
+
void aom_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad128x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad128x64x4d aom_sad128x64x4d_sse2
@@ -4590,6 +4669,9 @@ unsigned int aom_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x16_avg aom_sad16x16_avg_sse2
+void aom_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x16x3d aom_sad16x16x3d_c
+
void aom_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x16x4d aom_sad16x16x4d_sse2
@@ -4606,6 +4688,9 @@ unsigned int aom_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x32_avg aom_sad16x32_avg_sse2
+void aom_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x32x3d aom_sad16x32x3d_c
+
void aom_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x32x4d aom_sad16x32x4d_sse2
@@ -4622,6 +4707,9 @@ unsigned int aom_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x4_avg aom_sad16x4_avg_sse2
+void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x4x3d aom_sad16x4x3d_c
+
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x4x4d aom_sad16x4x4d_sse2
@@ -4638,6 +4726,9 @@ unsigned int aom_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad16x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x64_avg aom_sad16x64_avg_sse2
+void aom_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x64x3d aom_sad16x64x3d_c
+
void aom_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x64x4d aom_sad16x64x4d_sse2
@@ -4654,6 +4745,9 @@ unsigned int aom_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x8_avg aom_sad16x8_avg_sse2
+void aom_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x8x3d aom_sad16x8x3d_c
+
void aom_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x8x4d aom_sad16x8x4d_sse2
@@ -4674,6 +4768,9 @@ unsigned int aom_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x16_avg aom_sad32x16_avg_sse2
+void aom_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x16x3d aom_sad32x16x3d_c
+
void aom_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x16x4d aom_sad32x16x4d_sse2
@@ -4690,6 +4787,9 @@ unsigned int aom_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x32_avg aom_sad32x32_avg_sse2
+void aom_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x32x3d aom_sad32x32x3d_c
+
void aom_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x32x4d aom_sad32x32x4d_sse2
@@ -4706,6 +4806,9 @@ unsigned int aom_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x64_avg aom_sad32x64_avg_sse2
+void aom_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x64x3d aom_sad32x64x3d_c
+
void aom_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x64x4d aom_sad32x64x4d_sse2
@@ -4722,6 +4825,9 @@ unsigned int aom_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad32x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x8_avg aom_sad32x8_avg_sse2
+void aom_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x8x3d aom_sad32x8x3d_c
+
void aom_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x8x4d aom_sad32x8x4d_sse2
@@ -4742,6 +4848,9 @@ unsigned int aom_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad4x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad4x16_avg aom_sad4x16_avg_sse2
+void aom_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x16x3d aom_sad4x16x3d_c
+
void aom_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad4x16x4d aom_sad4x16x4d_sse2
@@ -4758,6 +4867,9 @@ unsigned int aom_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad4x4_avg aom_sad4x4_avg_sse2
+void aom_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x4x3d aom_sad4x4x3d_c
+
void aom_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad4x4x4d aom_sad4x4x4d_sse2
@@ -4774,6 +4886,9 @@ unsigned int aom_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad4x8_avg aom_sad4x8_avg_sse2
+void aom_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x8x3d aom_sad4x8x3d_c
+
void aom_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad4x8x4d aom_sad4x8x4d_sse2
@@ -4794,6 +4909,9 @@ unsigned int aom_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_sad64x128_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x128_avg aom_sad64x128_avg_sse2
+void aom_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x128x3d aom_sad64x128x3d_c
+
void aom_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x128x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x128x4d aom_sad64x128x4d_sse2
@@ -4810,6 +4928,9 @@ unsigned int aom_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad64x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x16_avg aom_sad64x16_avg_sse2
+void aom_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x16x3d aom_sad64x16x3d_c
+
void aom_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x16x4d aom_sad64x16x4d_sse2
@@ -4826,6 +4947,9 @@ unsigned int aom_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x32_avg aom_sad64x32_avg_sse2
+void aom_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x32x3d aom_sad64x32x3d_c
+
void aom_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x32x4d aom_sad64x32x4d_sse2
@@ -4842,6 +4966,9 @@ unsigned int aom_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x64_avg aom_sad64x64_avg_sse2
+void aom_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x64x3d aom_sad64x64x3d_c
+
void aom_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x64x4d aom_sad64x64x4d_sse2
@@ -4862,6 +4989,9 @@ unsigned int aom_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x16_avg aom_sad8x16_avg_sse2
+void aom_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x16x3d aom_sad8x16x3d_c
+
void aom_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x16x4d aom_sad8x16x4d_sse2
@@ -4878,6 +5008,9 @@ unsigned int aom_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad8x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x32_avg aom_sad8x32_avg_sse2
+void aom_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x32x3d aom_sad8x32x3d_c
+
void aom_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x32x4d aom_sad8x32x4d_sse2
@@ -4894,6 +5027,9 @@ unsigned int aom_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x4_avg aom_sad8x4_avg_sse2
+void aom_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x4x3d aom_sad8x4x3d_c
+
void aom_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x4x4d aom_sad8x4x4d_sse2
@@ -4910,6 +5046,9 @@ unsigned int aom_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x8_avg aom_sad8x8_avg_sse2
+void aom_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x8x3d aom_sad8x8x3d_c
+
void aom_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x8x4d aom_sad8x8x4d_sse2
@@ -5755,9 +5894,12 @@ unsigned int aom_variance8x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance8x8 aom_variance8x8_sse2
-int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
#define aom_vector_var aom_vector_var_c
+double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
+#define av1_compute_cross_correlation av1_compute_cross_correlation_c
+
void aom_dsp_rtcd(void);
#ifdef RTCD_C
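
The hunks above add 3-candidate (x3d) SAD entry points next to the existing x4d ones, reusing the same 4-wide prototype. A minimal caller sketch, assuming the x3d kernels fill only the first three output slots, might look like the following (not part of the patch):

/* Illustrative caller of the new x3d SAD entry point declared above.
 * Assumes the x3d kernels reuse the 4-wide x4d prototype and write SADs
 * for the first three references only. */
#include <stdint.h>
#include <stdio.h>
#include "config/aom_dsp_rtcd.h" /* the generated header shown in the hunks above */

static void print_three_sads(const uint8_t *src, int src_stride,
                             const uint8_t *ref0, const uint8_t *ref1,
                             const uint8_t *ref2, int ref_stride) {
  const uint8_t *const refs[4] = { ref0, ref1, ref2, NULL }; /* 4th slot unused (assumed) */
  uint32_t sads[4] = { 0, 0, 0, 0 };
  aom_sad16x16x3d(src, src_stride, refs, ref_stride, sads);
  for (int i = 0; i < 3; ++i) printf("candidate %d: SAD %u\n", i, sads[i]);
}
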
diff --git a/config/x86/config/aom_scale_rtcd.h b/config/x86/config/aom_scale_rtcd.h
index 65c184b83..28e903d6c 100644
--- a/config/x86/config/aom_scale_rtcd.h
+++ b/config/x86/config/aom_scale_rtcd.h
@@ -17,6 +17,9 @@ extern "C" {
void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
+void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
+#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
+
void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
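
aom_scale_rtcd.h gains a per-plane, row-range variant of the frame-border extension. A hypothetical sketch of splitting the work into vertical bands follows; the band interpretation of v_start/v_end is an assumption for illustration, not taken from the patch:

/* Hypothetical use of the new per-plane, row-band border extension. */
#include "config/aom_scale_rtcd.h" /* the generated header shown in the hunks above */

static void extend_borders_in_bands(const struct yv12_buffer_config *ybf,
                                    int num_planes, const int plane_heights[],
                                    int num_bands) {
  for (int plane = 0; plane < num_planes; ++plane) {
    for (int band = 0; band < num_bands; ++band) {
      /* Each band covers a disjoint range of rows of this plane (assumed). */
      const int v_start = band * plane_heights[plane] / num_bands;
      const int v_end = (band + 1) * plane_heights[plane] / num_bands;
      aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end);
    }
  }
}
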
diff --git a/config/x86/config/av1_rtcd.h b/config/x86/config/av1_rtcd.h
index 663cc8136..ef17ccb55 100644
--- a/config/x86/config/av1_rtcd.h
+++ b/config/x86/config/av1_rtcd.h
@@ -174,12 +174,13 @@ int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8
int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
#define av1_calc_frame_error av1_calc_frame_error_sse2
-void av1_calc_indices_dim1_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
-void av1_calc_indices_dim1_sse2(const int *data, const int *centroids, uint8_t *indices, int n, int k);
+void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
+void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim1 av1_calc_indices_dim1_sse2
-void av1_calc_indices_dim2_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
-#define av1_calc_indices_dim2 av1_calc_indices_dim2_c
+void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
+void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
+#define av1_calc_indices_dim2 av1_calc_indices_dim2_sse2
void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params av1_calc_proj_params_c
@@ -205,9 +206,6 @@ void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int
bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
-double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
#define av1_compute_stats av1_compute_stats_c
@@ -611,14 +609,14 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uin
void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_sse2
-void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_ssse3
-void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_ssse3
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
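
In av1_rtcd.h the k-means index-assignment kernels switch from int to int16_t data/centroids and gain a total_dist output. A hedged caller sketch for the one-dimensional case, with illustrative names only:

/* Hedged caller sketch for the updated k-means index assignment. */
#include <stdint.h>
#include "config/av1_rtcd.h" /* the generated header shown in the hunks above */

static int64_t assign_indices_1d(const int16_t *data, int n,
                                 const int16_t *centroids, int k,
                                 uint8_t *indices) {
  int64_t total_dist = 0;
  av1_calc_indices_dim1(data, centroids, indices, &total_dist, n, k);
  return total_dist; /* total distortion of the chosen assignment (assumed) */
}
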
diff --git a/config/x86_64/config/aom_config.asm b/config/x86_64/config/aom_config.asm
index 83ac5528f..135882afb 100644
--- a/config/x86_64/config/aom_config.asm
+++ b/config/x86_64/config/aom_config.asm
@@ -1,5 +1,4 @@
%define ARCH_ARM 0
-%define ARCH_MIPS 0
%define ARCH_PPC 0
%define ARCH_X86 0
%define ARCH_X86_64 1
@@ -57,14 +56,11 @@
%define DECODE_HEIGHT_LIMIT 16384
%define DECODE_WIDTH_LIMIT 16384
%define FORCE_HIGHBITDEPTH_DECODING 0
+%define HAVE_ARM_CRC32 0
%define HAVE_AVX 0
%define HAVE_AVX2 0
-%define HAVE_DSPR2 0
%define HAVE_FEXCEPT 1
-%define HAVE_MIPS32 0
-%define HAVE_MIPS64 0
%define HAVE_MMX 1
-%define HAVE_MSA 0
%define HAVE_NEON 0
%define HAVE_PTHREAD_H 1
%define HAVE_SSE 1
diff --git a/config/x86_64/config/aom_config.h b/config/x86_64/config/aom_config.h
index 7e86ec9ee..0b2318620 100644
--- a/config/x86_64/config/aom_config.h
+++ b/config/x86_64/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,7 +11,6 @@
#ifndef AOM_CONFIG_H_
#define AOM_CONFIG_H_
#define ARCH_ARM 0
-#define ARCH_MIPS 0
#define ARCH_PPC 0
#define ARCH_X86 0
#define ARCH_X86_64 1
@@ -69,14 +68,11 @@
#define DECODE_HEIGHT_LIMIT 16384
#define DECODE_WIDTH_LIMIT 16384
#define FORCE_HIGHBITDEPTH_DECODING 0
+#define HAVE_ARM_CRC32 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
-#define HAVE_DSPR2 0
#define HAVE_FEXCEPT 1
-#define HAVE_MIPS32 0
-#define HAVE_MIPS64 0
#define HAVE_MMX 1
-#define HAVE_MSA 0
#define HAVE_NEON 0
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 1
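
The regenerated x86_64 config drops the MIPS toggles and introduces HAVE_ARM_CRC32 (set to 0 here). A minimal illustration of gating an optional code path on such a toggle; the guarded function is hypothetical:

/* Illustrative compile-time gate on the new HAVE_ARM_CRC32 toggle. */
#include "config/aom_config.h"
#include <stddef.h>
#include <stdint.h>

#if HAVE_ARM_CRC32
uint32_t hypothetical_block_hash(const uint8_t *buf, size_t len); /* CRC32-accelerated path */
#else
/* Fallback when the CRC32 extension is unavailable, as in this x86_64 config. */
static uint32_t hypothetical_block_hash(const uint8_t *buf, size_t len) {
  uint32_t h = 0;
  for (size_t i = 0; i < len; ++i) h = h * 31 + buf[i];
  return h;
}
#endif
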
diff --git a/config/x86_64/config/aom_dsp_rtcd.h b/config/x86_64/config/aom_dsp_rtcd.h
index 8634a71e7..c4f99d623 100644
--- a/config/x86_64/config/aom_dsp_rtcd.h
+++ b/config/x86_64/config/aom_dsp_rtcd.h
@@ -601,9 +601,12 @@ unsigned int aom_get_mb_ss_c(const int16_t *);
unsigned int aom_get_mb_ss_sse2(const int16_t *);
#define aom_get_mb_ss aom_get_mb_ss_sse2
-void aom_get_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void aom_get_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define aom_get_sse_sum_8x8_quad aom_get_sse_sum_8x8_quad_sse2
+void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
+#define aom_get_var_sse_sum_16x16_dual aom_get_var_sse_sum_16x16_dual_c
+
+void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8);
+#define aom_get_var_sse_sum_8x8_quad aom_get_var_sse_sum_8x8_quad_sse2
void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -3167,6 +3170,9 @@ unsigned int aom_highbd_sad128x128_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
+
void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_c
@@ -3176,6 +3182,9 @@ unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
+
void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_c
@@ -3187,6 +3196,9 @@ unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_sse2
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
+
void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_sse2
@@ -3199,6 +3211,9 @@ unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_sse2
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
+
void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_sse2
@@ -3211,6 +3226,9 @@ unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_sse2
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
+
void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_sse2
@@ -3223,6 +3241,9 @@ unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_sse2
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
+
void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_sse2
@@ -3235,6 +3256,9 @@ unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_sse2
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
+
void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_sse2
@@ -3247,6 +3271,9 @@ unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_sse2
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
+
void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_sse2
@@ -3259,6 +3286,9 @@ unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_sse2
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
+
void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_sse2
@@ -3271,6 +3301,9 @@ unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_sse2
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
+
void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_sse2
@@ -3283,6 +3316,9 @@ unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad32x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_sse2
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
+
void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_sse2
@@ -3295,6 +3331,9 @@ unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad4x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_sse2
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
+
void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_sse2
@@ -3307,6 +3346,9 @@ unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_sse2
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
+
void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_sse2
@@ -3319,6 +3361,9 @@ unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_sse2
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
+
void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_sse2
@@ -3329,6 +3374,9 @@ unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
+
void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_c
@@ -3340,6 +3388,9 @@ unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_sse2
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
+
void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_sse2
@@ -3352,6 +3403,9 @@ unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_sse2
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
+
void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_sse2
@@ -3364,6 +3418,9 @@ unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_sse2
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
+
void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_sse2
@@ -3376,6 +3433,9 @@ unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_sse2
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
+
void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_sse2
@@ -3388,6 +3448,9 @@ unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_sse2
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
+
void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_sse2
@@ -3400,6 +3463,9 @@ unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_sse2
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
+
void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_sse2
@@ -3412,6 +3478,9 @@ unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_sse2
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
+
void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void aom_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_sse2
@@ -3847,12 +3916,12 @@ void aom_ifft8x8_float_c(const float *input, float *temp, float *output);
void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output);
#define aom_ifft8x8_float aom_ifft8x8_float_sse2
-int16_t aom_int_pro_col_c(const uint8_t *ref, const int width);
-int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width);
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_col aom_int_pro_col_sse2
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
-void aom_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor);
#define aom_int_pro_row aom_int_pro_row_sse2
void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
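A side note, not part of this change: the integer-projection helpers above move from fixed 16-wide variants to explicit width/height/norm_factor parameters. Assuming they keep their historical meaning, aom_int_pro_row produces one normalized sum per pixel column and aom_int_pro_col one normalized sum per pixel row; a plain-C sketch under that assumption, with hypothetical names:

    #include <stdint.h>

    /* Sketch of the assumed semantics: sums taken down each column (row
     * projection) or across each row (column projection), right-shifted by
     * norm_factor. */
    static void sketch_int_pro_row(int16_t *hbuf, const uint8_t *ref,
                                   int ref_stride, int width, int height,
                                   int norm_factor) {
      for (int j = 0; j < width; ++j) {
        int sum = 0;
        for (int i = 0; i < height; ++i) sum += ref[i * ref_stride + j];
        hbuf[j] = (int16_t)(sum >> norm_factor);
      }
    }

    static void sketch_int_pro_col(int16_t *vbuf, const uint8_t *ref,
                                   int ref_stride, int width, int height,
                                   int norm_factor) {
      for (int i = 0; i < height; ++i) {
        int sum = 0;
        for (int j = 0; j < width; ++j) sum += ref[i * ref_stride + j];
        vbuf[i] = (int16_t)(sum >> norm_factor);
      }
    }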
@@ -4238,6 +4307,10 @@ unsigned int aom_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
unsigned int aom_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_mse8x8 aom_mse8x8_sse2
+uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w, int h);
+uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, int w, int h);
+#define aom_mse_16xh_16bit aom_mse_16xh_16bit_sse2
+
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h);
uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h);
#define aom_mse_wxh_16bit aom_mse_wxh_16bit_sse2
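A side note, not part of this change: despite the "mse" in the name, the wxh_16bit family above appears to return a raw sum of squared differences between an 8-bit reconstruction and a 16-bit source, with callers doing any normalization; the new 16xh variant presumably specializes the source layout to 16-wide blocks. A hedged plain-C sketch of the wxh form, with a hypothetical name:

    #include <stdint.h>

    /* Hedged sketch: accumulate squared differences between an 8-bit block
     * (dst) and a 16-bit block (src); normalization is left to the caller. */
    static uint64_t sketch_mse_wxh_16bit(const uint8_t *dst, int dstride,
                                         const uint16_t *src, int sstride,
                                         int w, int h) {
      uint64_t sum = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int e = (int)dst[i * dstride + j] - (int)src[i * sstride + j];
          sum += (int64_t)e * e;
        }
      }
      return sum;
    }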
@@ -4557,6 +4630,9 @@ unsigned int aom_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_sad128x128_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad128x128_avg aom_sad128x128_avg_sse2
+void aom_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x128x3d aom_sad128x128x3d_c
+
void aom_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad128x128x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad128x128x4d aom_sad128x128x4d_sse2
@@ -4573,6 +4649,9 @@ unsigned int aom_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_sad128x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad128x64_avg aom_sad128x64_avg_sse2
+void aom_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad128x64x3d aom_sad128x64x3d_c
+
void aom_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad128x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad128x64x4d aom_sad128x64x4d_sse2
@@ -4593,6 +4672,9 @@ unsigned int aom_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x16_avg aom_sad16x16_avg_sse2
+void aom_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x16x3d aom_sad16x16x3d_c
+
void aom_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x16x4d aom_sad16x16x4d_sse2
@@ -4609,6 +4691,9 @@ unsigned int aom_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x32_avg aom_sad16x32_avg_sse2
+void aom_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x32x3d aom_sad16x32x3d_c
+
void aom_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x32x4d aom_sad16x32x4d_sse2
@@ -4625,6 +4710,9 @@ unsigned int aom_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x4_avg aom_sad16x4_avg_sse2
+void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x4x3d aom_sad16x4x3d_c
+
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x4x4d aom_sad16x4x4d_sse2
@@ -4641,6 +4729,9 @@ unsigned int aom_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad16x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x64_avg aom_sad16x64_avg_sse2
+void aom_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x64x3d aom_sad16x64x3d_c
+
void aom_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x64x4d aom_sad16x64x4d_sse2
@@ -4657,6 +4748,9 @@ unsigned int aom_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad16x8_avg aom_sad16x8_avg_sse2
+void aom_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad16x8x3d aom_sad16x8x3d_c
+
void aom_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad16x8x4d aom_sad16x8x4d_sse2
@@ -4677,6 +4771,9 @@ unsigned int aom_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x16_avg aom_sad32x16_avg_sse2
+void aom_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x16x3d aom_sad32x16x3d_c
+
void aom_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x16x4d aom_sad32x16x4d_sse2
@@ -4693,6 +4790,9 @@ unsigned int aom_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x32_avg aom_sad32x32_avg_sse2
+void aom_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x32x3d aom_sad32x32x3d_c
+
void aom_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x32x4d aom_sad32x32x4d_sse2
@@ -4709,6 +4809,9 @@ unsigned int aom_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x64_avg aom_sad32x64_avg_sse2
+void aom_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x64x3d aom_sad32x64x3d_c
+
void aom_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x64x4d aom_sad32x64x4d_sse2
@@ -4725,6 +4828,9 @@ unsigned int aom_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad32x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad32x8_avg aom_sad32x8_avg_sse2
+void aom_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad32x8x3d aom_sad32x8x3d_c
+
void aom_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad32x8x4d aom_sad32x8x4d_sse2
@@ -4745,6 +4851,9 @@ unsigned int aom_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad4x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad4x16_avg aom_sad4x16_avg_sse2
+void aom_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x16x3d aom_sad4x16x3d_c
+
void aom_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad4x16x4d aom_sad4x16x4d_sse2
@@ -4761,6 +4870,9 @@ unsigned int aom_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad4x4_avg aom_sad4x4_avg_sse2
+void aom_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x4x3d aom_sad4x4x3d_c
+
void aom_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad4x4x4d aom_sad4x4x4d_sse2
@@ -4777,6 +4889,9 @@ unsigned int aom_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad4x8_avg aom_sad4x8_avg_sse2
+void aom_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad4x8x3d aom_sad4x8x3d_c
+
void aom_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad4x8x4d aom_sad4x8x4d_sse2
@@ -4797,6 +4912,9 @@ unsigned int aom_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_sad64x128_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x128_avg aom_sad64x128_avg_sse2
+void aom_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x128x3d aom_sad64x128x3d_c
+
void aom_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x128x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x128x4d aom_sad64x128x4d_sse2
@@ -4813,6 +4931,9 @@ unsigned int aom_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad64x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x16_avg aom_sad64x16_avg_sse2
+void aom_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x16x3d aom_sad64x16x3d_c
+
void aom_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x16x4d aom_sad64x16x4d_sse2
@@ -4829,6 +4950,9 @@ unsigned int aom_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x32_avg aom_sad64x32_avg_sse2
+void aom_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x32x3d aom_sad64x32x3d_c
+
void aom_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x32x4d aom_sad64x32x4d_sse2
@@ -4845,6 +4969,9 @@ unsigned int aom_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int aom_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad64x64_avg aom_sad64x64_avg_sse2
+void aom_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad64x64x3d aom_sad64x64x3d_c
+
void aom_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad64x64x4d aom_sad64x64x4d_sse2
@@ -4865,6 +4992,9 @@ unsigned int aom_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x16_avg aom_sad8x16_avg_sse2
+void aom_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x16x3d aom_sad8x16x3d_c
+
void aom_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x16x4d aom_sad8x16x4d_sse2
@@ -4881,6 +5011,9 @@ unsigned int aom_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uin
unsigned int aom_sad8x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x32_avg aom_sad8x32_avg_sse2
+void aom_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x32x3d aom_sad8x32x3d_c
+
void aom_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x32x4d aom_sad8x32x4d_sse2
@@ -4897,6 +5030,9 @@ unsigned int aom_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x4_avg aom_sad8x4_avg_sse2
+void aom_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x4x3d aom_sad8x4x3d_c
+
void aom_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x4x4d aom_sad8x4x4d_sse2
@@ -4913,6 +5049,9 @@ unsigned int aom_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
unsigned int aom_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_sad8x8_avg aom_sad8x8_avg_sse2
+void aom_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_sad8x8x3d aom_sad8x8x3d_c
+
void aom_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_sad8x8x4d aom_sad8x8x4d_sse2
@@ -5759,9 +5898,12 @@ unsigned int aom_variance8x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance8x8 aom_variance8x8_sse2
-int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
#define aom_vector_var aom_vector_var_c
+double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
+#define av1_compute_cross_correlation av1_compute_cross_correlation_c
+
void aom_dsp_rtcd(void);
#ifdef RTCD_C
diff --git a/config/x86_64/config/aom_scale_rtcd.h b/config/x86_64/config/aom_scale_rtcd.h
index 65c184b83..28e903d6c 100644
--- a/config/x86_64/config/aom_scale_rtcd.h
+++ b/config/x86_64/config/aom_scale_rtcd.h
@@ -17,6 +17,9 @@ extern "C" {
void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
+void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
+#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
+
void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
diff --git a/config/x86_64/config/av1_rtcd.h b/config/x86_64/config/av1_rtcd.h
index 1dc26c908..00a607d6b 100644
--- a/config/x86_64/config/av1_rtcd.h
+++ b/config/x86_64/config/av1_rtcd.h
@@ -174,12 +174,13 @@ int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8
int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
#define av1_calc_frame_error av1_calc_frame_error_sse2
-void av1_calc_indices_dim1_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
-void av1_calc_indices_dim1_sse2(const int *data, const int *centroids, uint8_t *indices, int n, int k);
+void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
+void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim1 av1_calc_indices_dim1_sse2
-void av1_calc_indices_dim2_c(const int *data, const int *centroids, uint8_t *indices, int n, int k);
-#define av1_calc_indices_dim2 av1_calc_indices_dim2_c
+void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
+void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
+#define av1_calc_indices_dim2 av1_calc_indices_dim2_sse2
void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params av1_calc_proj_params_c
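A side note, not part of this change: the av1_calc_indices_* prototypes above now take int16_t data/centroids and an int64_t *total_dist output. Read as the assignment step of k-means, the dim1 variant plausibly behaves as sketched below; this is an assumption for orientation, not the library's implementation, and the name is hypothetical.

    #include <stdint.h>

    /* Hypothetical sketch of a 1-D nearest-centroid assignment matching the
     * new prototype: write the best centroid index per sample and, when
     * requested, accumulate the total squared distance. */
    static void sketch_calc_indices_dim1(const int16_t *data,
                                         const int16_t *centroids,
                                         uint8_t *indices,
                                         int64_t *total_dist, int n, int k) {
      int64_t sum = 0;
      for (int i = 0; i < n; ++i) {
        int best = 0;
        int64_t best_dist = INT64_MAX;
        for (int j = 0; j < k; ++j) {
          const int64_t diff = (int64_t)data[i] - centroids[j];
          const int64_t dist = diff * diff;
          if (dist < best_dist) {
            best_dist = dist;
            best = j;
          }
        }
        indices[i] = (uint8_t)best;
        sum += best_dist;
      }
      if (total_dist) *total_dist = sum;
    }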
@@ -205,9 +206,6 @@ void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int
bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
-double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
#define av1_compute_stats av1_compute_stats_c
@@ -614,14 +612,14 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uin
void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_sse2
-void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_ssse3
-void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_ssse3
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
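A side note, not part of this change: the cdef_copy_rect8_* hunks above only rename the trailing v/h parameters to width/height; the prototypes are otherwise unchanged, and the operation is a rectangle copy that widens 8-bit samples to 16 bits. A short sketch for orientation, with a hypothetical name:

    #include <stdint.h>

    /* Sketch: copy a width x height rectangle, widening each 8-bit sample to
     * 16 bits (the 16-bit-to-16-bit flavour is the same loop without the
     * widening). */
    static void sketch_copy_rect8_8bit_to_16bit(uint16_t *dst, int dstride,
                                                const uint8_t *src, int sstride,
                                                int width, int height) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          dst[i * dstride + j] = src[i * sstride + j];
    }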
diff --git a/examples/set_maps.c b/examples/set_maps.c
index bcb28a063..2593faba3 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -15,7 +15,7 @@
// This is an example demonstrating how to control the AOM encoder's
// ROI and Active maps.
//
-// ROI (Reigon of Interest) maps are a way for the application to assign
+// ROI (Region of Interest) maps are a way for the application to assign
// each macroblock in the image to a region, and then set quantizer and
// filtering parameters on that image.
//
@@ -27,12 +27,12 @@
// Configuration
// -------------
// An ROI map is set on frame 22. If the width of the image in macroblocks
-// is evenly divisble by 4, then the output will appear to have distinct
+// is evenly divisible by 4, then the output will appear to have distinct
// columns, where the quantizer, loopfilter, and static threshold differ
// from column to column.
//
// An active map is set on frame 33. If the width of the image in macroblocks
-// is evenly divisble by 4, then the output will appear to have distinct
+// is evenly divisible by 4, then the output will appear to have distinct
// columns, where one column will have motion and the next will not.
//
// The active map is cleared on frame 44.
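A side note, not part of this change: for readers skimming the comment block above, an active-map update in this kind of example typically takes the shape sketched below. The aom_active_map_t layout and the AOME_SET_ACTIVEMAP control come from the public aom headers; codec, cfg, active_map_buf and die_codec stand in for the example's own variables and helpers and are assumptions here.

    // Hedged sketch of an active-map update via AOME_SET_ACTIVEMAP.
    aom_active_map_t map = { 0, 0, 0 };
    map.rows = (cfg.g_h + 15) / 16;   // macroblock rows
    map.cols = (cfg.g_w + 15) / 16;   // macroblock cols
    map.active_map = active_map_buf;  // one byte per macroblock
    if (aom_codec_control(&codec, AOME_SET_ACTIVEMAP, &map))
      die_codec(&codec, "Failed to set active map");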
diff --git a/examples/svc_encoder_rtc.c b/examples/svc_encoder_rtc.c
index 7629a1b64..bceb7d235 100644
--- a/examples/svc_encoder_rtc.c
+++ b/examples/svc_encoder_rtc.c
@@ -37,6 +37,8 @@ typedef struct {
int aq_mode;
int layering_mode;
int output_obu;
+ int decode;
+ int tune_content;
} AppInput;
typedef enum {
@@ -87,6 +89,17 @@ static const arg_def_t error_resilient_arg =
static const arg_def_t output_obu_arg =
ARG_DEF(NULL, "output-obu", 1,
"Write OBUs when set to 1. Otherwise write IVF files.");
+static const arg_def_t test_decode_arg =
+ ARG_DEF(NULL, "test-decode", 1,
+ "Attempt to test decoding the output when set to 1. Default is 1.");
+static const struct arg_enum_list tune_content_enum[] = {
+ { "default", AOM_CONTENT_DEFAULT },
+ { "screen", AOM_CONTENT_SCREEN },
+ { "film", AOM_CONTENT_FILM },
+ { NULL, 0 }
+};
+static const arg_def_t tune_content_arg = ARG_DEF_ENUM(
+ NULL, "tune-content", 1, "Tune content type", tune_content_enum);
#if CONFIG_AV1_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
@@ -97,18 +110,32 @@ static const arg_def_t bitdepth_arg = ARG_DEF_ENUM(
"d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ", bitdepth_enum);
#endif // CONFIG_AV1_HIGHBITDEPTH
-static const arg_def_t *svc_args[] = {
- &frames_arg, &outputfile, &width_arg,
- &height_arg, &timebase_arg, &bitrate_arg,
- &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg,
- &min_q_arg, &max_q_arg, &temporal_layers_arg,
- &layering_mode_arg, &threads_arg, &aqmode_arg,
+static const arg_def_t *svc_args[] = { &frames_arg,
+ &outputfile,
+ &width_arg,
+ &height_arg,
+ &timebase_arg,
+ &bitrate_arg,
+ &spatial_layers_arg,
+ &kf_dist_arg,
+ &scale_factors_arg,
+ &min_q_arg,
+ &max_q_arg,
+ &temporal_layers_arg,
+ &layering_mode_arg,
+ &threads_arg,
+ &aqmode_arg,
#if CONFIG_AV1_HIGHBITDEPTH
- &bitdepth_arg,
+ &bitdepth_arg,
#endif
- &speed_arg, &bitrates_arg, &dropframe_thresh_arg,
- &error_resilient_arg, &output_obu_arg, NULL
-};
+ &speed_arg,
+ &bitrates_arg,
+ &dropframe_thresh_arg,
+ &error_resilient_arg,
+ &output_obu_arg,
+ &test_decode_arg,
+ &tune_content_arg,
+ NULL };
#define zero(Dest) memset(&(Dest), 0, sizeof(Dest))
@@ -261,6 +288,7 @@ static void parse_command_line(int argc, const char **argv_,
svc_params->number_temporal_layers = 1;
app_input->layering_mode = 0;
app_input->output_obu = 0;
+ app_input->decode = 1;
enc_cfg->g_threads = 1;
enc_cfg->rc_end_usage = AOM_CBR;
@@ -342,6 +370,14 @@ static void parse_command_line(int argc, const char **argv_,
if (app_input->output_obu != 0 && app_input->output_obu != 1)
die("Invalid value for obu output flag (0, 1): %d.",
app_input->output_obu);
+ } else if (arg_match(&arg, &test_decode_arg, argi)) {
+ app_input->decode = arg_parse_uint(&arg);
+ if (app_input->decode != 0 && app_input->decode != 1)
+ die("Invalid value for test decode flag (0, 1): %d.",
+ app_input->decode);
+ } else if (arg_match(&arg, &tune_content_arg, argi)) {
+ app_input->tune_content = arg_parse_enum_or_int(&arg);
+ printf("tune content %d\n", app_input->tune_content);
} else {
++argj;
}
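A side note, not part of this change: on the command line, the two options parsed above would be appended to the example's usual invocation roughly as --tune-content=screen (or default/film) and --test-decode=0 (or 1); the rest of the arguments are unchanged and are elided here because they depend on the usual svc_encoder_rtc setup.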
@@ -566,6 +602,9 @@ static void set_layer_pattern(
aom_svc_ref_frame_config_t *ref_frame_config,
aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int *use_svc_control,
int spatial_layer_id, int is_key_frame, int ksvc_mode, int speed) {
+  // Setting this flag to 1 enables a simple example of
+  // RPS (Reference Picture Selection) for 1 layer.
+ int use_rps_example = 0;
int i;
int enable_longterm_temporal_ref = 1;
int shift = (layering_mode == 8) ? 2 : 0;
@@ -590,10 +629,64 @@ static void set_layer_pattern(
}
switch (layering_mode) {
case 0:
- // 1-layer: update LAST on every frame, reference LAST.
- layer_id->temporal_layer_id = 0;
- ref_frame_config->refresh[0] = 1;
- ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ if (use_rps_example == 0) {
+ // 1-layer: update LAST on every frame, reference LAST.
+ layer_id->temporal_layer_id = 0;
+ layer_id->spatial_layer_id = 0;
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else {
+        // Pattern of 2 references (ALTREF and GOLDEN) trailing
+        // LAST by 4 and 8 frames, with some switching logic to
+        // sometimes only predict from a longer-term reference.
+        // This is a simple example to test RPS (reference picture selection)
+        // as a method to handle network packet loss.
+ int last_idx = 0;
+ int last_idx_refresh = 0;
+ int gld_idx = 0;
+ int alt_ref_idx = 0;
+ int lag_alt = 4;
+ int lag_gld = 8;
+ layer_id->temporal_layer_id = 0;
+ layer_id->spatial_layer_id = 0;
+ int sh = 8; // slots 0 - 7.
+ // Moving index slot for last: 0 - (sh - 1)
+ if (superframe_cnt > 1) last_idx = (superframe_cnt - 1) % sh;
+ // Moving index for refresh of last: one ahead for next frame.
+ last_idx_refresh = superframe_cnt % sh;
+        // Moving index for gld_ref, lag behind current by lag_gld frames.
+ if (superframe_cnt > lag_gld) gld_idx = (superframe_cnt - lag_gld) % sh;
+ // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+ if (superframe_cnt > lag_alt)
+ alt_ref_idx = (superframe_cnt - lag_alt) % sh;
+ // Set the ref_idx.
+ // Default all references to slot for last.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = last_idx;
+ // Set the ref_idx for the relevant references.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = last_idx;
+ ref_frame_config->ref_idx[SVC_LAST2_FRAME] = last_idx_refresh;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = gld_idx;
+ ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = alt_ref_idx;
+ // Refresh this slot, which will become LAST on next frame.
+ ref_frame_config->refresh[last_idx_refresh] = 1;
+ // Reference LAST, ALTREF, and GOLDEN
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ // Switch to only ALTREF for frames 200 to 250.
+ if (superframe_cnt >= 200 && superframe_cnt < 250) {
+ ref_frame_config->reference[SVC_LAST_FRAME] = 0;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0;
+ }
+ // Switch to only GOLDEN for frames 400 to 450.
+ if (superframe_cnt >= 400 && superframe_cnt < 450) {
+ ref_frame_config->reference[SVC_LAST_FRAME] = 0;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 0;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ }
+ }
break;
case 1:
// 2-temporal layer.
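A side note, not part of this change: a worked reading of the slot arithmetic in the RPS branch above. With sh = 8, lag_alt = 4 and lag_gld = 8, a frame with superframe_cnt = 10 gets last_idx = (10 - 1) % 8 = 1, last_idx_refresh = 10 % 8 = 2, gld_idx = (10 - 8) % 8 = 2 and alt_ref_idx = (10 - 4) % 8 = 6. Slot 1 holds the previous frame (LAST), slot 6 the frame 4 frames back (ALTREF), and slot 2 currently holds the frame 8 frames back (GOLDEN); slot 2 is then refreshed with frame 10, which makes it LAST for frame 11.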
@@ -1250,8 +1343,10 @@ int main(int argc, const char **argv) {
die("Failed to initialize encoder");
#if CONFIG_AV1_DECODER
- if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0)) {
- die("Failed to initialize decoder");
+ if (app_input.decode) {
+ if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0)) {
+ die("Failed to initialize decoder");
+ }
}
#endif
@@ -1271,10 +1366,26 @@ int main(int argc, const char **argv) {
aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3);
aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3);
aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1);
+
+ // Settings to reduce key frame encoding time.
+ aom_codec_control(&codec, AV1E_SET_ENABLE_CFL_INTRA, 0);
+ aom_codec_control(&codec, AV1E_SET_ENABLE_SMOOTH_INTRA, 0);
+ aom_codec_control(&codec, AV1E_SET_ENABLE_ANGLE_DELTA, 0);
+ aom_codec_control(&codec, AV1E_SET_ENABLE_FILTER_INTRA, 0);
+ aom_codec_control(&codec, AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1);
+
aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS,
cfg.g_threads ? get_msb(cfg.g_threads) : 0);
if (cfg.g_threads > 1) aom_codec_control(&codec, AV1E_SET_ROW_MT, 1);
+ aom_codec_control(&codec, AV1E_SET_TUNE_CONTENT, app_input.tune_content);
+ if (app_input.tune_content == AOM_CONTENT_SCREEN) {
+ aom_codec_control(&codec, AV1E_SET_ENABLE_PALETTE, 1);
+ aom_codec_control(&codec, AV1E_SET_ENABLE_CFL_INTRA, 1);
+ // INTRABC is currently disabled for rt mode, as it's too slow.
+ aom_codec_control(&codec, AV1E_SET_ENABLE_INTRABC, 0);
+ }
+
svc_params.number_spatial_layers = ss_number_layers;
svc_params.number_temporal_layers = ts_number_layers;
for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
@@ -1367,15 +1478,44 @@ int main(int argc, const char **argv) {
if (frame_avail && slx == 0) ++rc.layer_input_frames[layer];
if (test_dynamic_scaling_single_layer) {
- if (frame_cnt >= 200 && frame_cnt <= 400) {
+ // Example to scale source down by 2x2, then 4x4, and then back up to
+ // 2x2, and then back to original.
+ int frame_2x2 = 200;
+ int frame_4x4 = 400;
+ int frame_2x2up = 600;
+ int frame_orig = 800;
+ if (frame_cnt >= frame_2x2 && frame_cnt < frame_4x4) {
// Scale source down by 2x2.
struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
- } else {
+ } else if (frame_cnt >= frame_4x4 && frame_cnt < frame_2x2up) {
+ // Scale source down by 4x4.
+ struct aom_scaling_mode mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+ aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+ } else if (frame_cnt >= frame_2x2up && frame_cnt < frame_orig) {
+ // Source back up to 2x2.
+ struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+ aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+ } else if (frame_cnt >= frame_orig) {
// Source back up to original resolution (no scaling).
struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
}
+ if (frame_cnt == frame_2x2 || frame_cnt == frame_4x4 ||
+ frame_cnt == frame_2x2up || frame_cnt == frame_orig) {
+        // For dynamic resize testing on a single layer: refresh all
+        // references on the resized frame. This avoids a decode error:
+        // if the resize goes down by 4x4 or more, the libaom decoder will
+        // throw an error that some reference (even though unused) is beyond
+        // the size limit (references must be smaller than 4x4).
+ for (i = 0; i < REF_FRAMES; i++) ref_frame_config.refresh[i] = 1;
+ if (use_svc_control) {
+ aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
+ &ref_frame_config);
+ aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+ &ref_frame_comp_pred);
+ }
+ }
}
// Do the layer encode.
@@ -1460,9 +1600,11 @@ int main(int argc, const char **argv) {
}
#if CONFIG_AV1_DECODER
- if (aom_codec_decode(&decoder, pkt->data.frame.buf,
- (unsigned int)pkt->data.frame.sz, NULL))
- die_codec(&decoder, "Failed to decode frame.");
+ if (app_input.decode) {
+ if (aom_codec_decode(&decoder, pkt->data.frame.buf,
+ (unsigned int)pkt->data.frame.sz, NULL))
+ die_codec(&decoder, "Failed to decode frame.");
+ }
#endif
break;
@@ -1470,12 +1612,14 @@ int main(int argc, const char **argv) {
}
}
#if CONFIG_AV1_DECODER
- // Don't look for mismatch on top spatial and top temporal layers as they
- // are non reference frames.
- if ((ss_number_layers > 1 || ts_number_layers > 1) &&
- !(layer_id.temporal_layer_id > 0 &&
- layer_id.temporal_layer_id == (int)ts_number_layers - 1)) {
- test_decode(&codec, &decoder, frame_cnt, &mismatch_seen);
+ if (app_input.decode) {
+ // Don't look for mismatch on top spatial and top temporal layers as
+ // they are non reference frames.
+ if ((ss_number_layers > 1 || ts_number_layers > 1) &&
+ !(layer_id.temporal_layer_id > 0 &&
+ layer_id.temporal_layer_id == (int)ts_number_layers - 1)) {
+ test_decode(&codec, &decoder, frame_cnt, &mismatch_seen);
+ }
}
#endif
} // loop over spatial layers
diff --git a/libs.doxy_template b/libs.doxy_template
index 6e042ac93..ba77751a5 100644
--- a/libs.doxy_template
+++ b/libs.doxy_template
@@ -103,14 +103,6 @@ ALLOW_UNICODE_NAMES = NO
OUTPUT_LANGUAGE = English
-# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all generated output in the proper direction.
-# Possible values are: None, LTR, RTL and Context.
-# The default value is: None.
-
-OUTPUT_TEXT_DIRECTION = None
-
# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
# descriptions after the members that are listed in the file and class
# documentation (similar to Javadoc). Set to NO to disable this.
@@ -1820,16 +1812,6 @@ LATEX_BATCHMODE = NO
LATEX_HIDE_INDICES = NO
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE = NO
-
# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
@@ -1910,16 +1892,6 @@ RTF_STYLESHEET_FILE =
RTF_EXTENSIONS_FILE =
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE = NO
-
#---------------------------------------------------------------------------
# Configuration options related to the man page output
#---------------------------------------------------------------------------
@@ -1982,15 +1954,6 @@ GENERATE_DOCBOOK = NO
DOCBOOK_OUTPUT = docbook
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
#---------------------------------------------------------------------------
# Configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
@@ -2172,15 +2135,6 @@ EXTERNAL_PAGES = YES
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS = YES
-
# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2237,11 +2191,14 @@ DOT_FONTSIZE = 10
DOT_FONTPATH =
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
CLASS_GRAPH = YES
diff --git a/test/acm_random.h b/test/acm_random.h
index 8b1d51aef..bc38ba4da 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -27,43 +27,52 @@ class ACMRandom {
void Reset(int seed) { random_.Reseed(seed); }
// Generates a random 31-bit unsigned integer from [0, 2^31).
- uint32_t Rand31(void) {
+ uint32_t Rand31() {
return random_.Generate(testing::internal::Random::kMaxRange);
}
- uint16_t Rand16(void) {
+ uint16_t Rand16() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
return (value >> 15) & 0xffff;
}
- int16_t Rand15Signed(void) {
+ int16_t Rand16Signed() { return static_cast<int16_t>(Rand16()); }
+
+ int16_t Rand15() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
- return (value >> 17) & 0xffff;
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 16) & 0x7fff;
+ }
+
+ int16_t Rand15Signed() {
+ // Use 15 bits: values between 16383 (0x3FFF) and -16384 (0xC000).
+ return static_cast<int16_t>(Rand15()) - (1 << 14);
}
- uint16_t Rand12(void) {
+ uint16_t Rand12() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
// There's a bit more entropy in the upper bits of this implementation.
return (value >> 19) & 0xfff;
}
- int16_t Rand9Signed(void) {
+ int16_t Rand9Signed() {
// Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
const uint32_t value = random_.Generate(512);
return static_cast<int16_t>(value) - 256;
}
- uint8_t Rand8(void) {
+ uint8_t Rand8() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
// There's a bit more entropy in the upper bits of this implementation.
return (value >> 23) & 0xff;
}
- uint8_t Rand8Extremes(void) {
+ uint8_t Rand8Extremes() {
// Returns a random value near 0 or near 255, to better exercise
// saturation behavior.
const uint8_t r = Rand8();
@@ -74,7 +83,7 @@ class ACMRandom {
int operator()(int n) { return PseudoUniform(n); }
- static int DeterministicSeed(void) { return 0xbaba; }
+ static int DeterministicSeed() { return 0xbaba; }
private:
testing::internal::Random random_;
diff --git a/test/allintra_end_to_end_test.cc b/test/allintra_end_to_end_test.cc
index 0b9047c97..98a7973c7 100644
--- a/test/allintra_end_to_end_test.cc
+++ b/test/allintra_end_to_end_test.cc
@@ -132,4 +132,14 @@ AV1_INSTANTIATE_TEST_SUITE(AllIntraEndToEndTest,
::testing::Range(5, 9), ::testing::Range(0, 4),
::testing::Values(1), ::testing::Values(1),
::testing::Values(0, 1));
+
+INSTANTIATE_TEST_SUITE_P(
+ AV1MultiThreaded, AllIntraEndToEndTest,
+ ::testing::Combine(
+ ::testing::Values(
+ static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+ ::testing::ValuesIn(kTestVectors), ::testing::Range(5, 9),
+ ::testing::Range(0, 4), ::testing::Values(6), ::testing::Values(1),
+ ::testing::Values(0, 1)));
+
} // namespace
diff --git a/test/aom_image_test.cc b/test/aom_image_test.cc
index 6ee005834..ad48e73e3 100644
--- a/test/aom_image_test.cc
+++ b/test/aom_image_test.cc
@@ -42,7 +42,9 @@ TEST(AomImageTest, AomImgSetRectOverflow) {
EXPECT_EQ(aom_img_set_rect(&img, 0, 0, kWidth, kHeight, 0), 0);
// This would result in overflow because -1 is cast to UINT_MAX.
- EXPECT_NE(aom_img_set_rect(&img, -1, -1, kWidth, kHeight, 0), 0);
+ EXPECT_NE(aom_img_set_rect(&img, static_cast<unsigned int>(-1),
+ static_cast<unsigned int>(-1), kWidth, kHeight, 0),
+ 0);
}
TEST(AomImageTest, AomImgAllocNv12) {
diff --git a/test/aomdec.sh b/test/aomdec.sh
index eb1649a03..e9738a8e8 100755
--- a/test/aomdec.sh
+++ b/test/aomdec.sh
@@ -131,6 +131,16 @@ aomdec_av1_obu_annexb() {
fi
}
+aomdec_av1_obu_annexb_pipe_input() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="${AV1_OBU_ANNEXB_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 || return 1
+ fi
+ aomdec_pipe "${file}" --summary --noblit --annexb
+ fi
+}
+
aomdec_av1_obu_section5() {
if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
local file="${AV1_OBU_SEC5_FILE}"
@@ -141,6 +151,16 @@ aomdec_av1_obu_section5() {
fi
}
+aomdec_av1_obu_section5_pipe_input() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="${AV1_OBU_SEC5_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --obu || return 1
+ fi
+ aomdec_pipe "${file}" --summary --noblit
+ fi
+}
+
aomdec_av1_webm() {
if [ "$(aomdec_can_decode_av1)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
@@ -186,6 +206,8 @@ if [ ! "$(realtime_only_build)" = "yes" ]; then
aomdec_av1_ivf_error_resilient
aomdec_av1_obu_annexb
aomdec_av1_obu_section5
+ aomdec_av1_obu_annexb_pipe_input
+ aomdec_av1_obu_section5_pipe_input
aomdec_av1_webm"
fi
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index ebe3dfc15..12edfac64 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -468,9 +468,9 @@ class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> {
get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
const uint16_t *input = FirstRandomInput16(GetParam());
DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
- av1_highbd_convolve_x_sr(input, width, reference, kOutputStride, width,
- height, filter_params_x, sub_x, &conv_params1,
- bit_depth);
+ av1_highbd_convolve_x_sr_c(input, width, reference, kOutputStride, width,
+ height, filter_params_x, sub_x, &conv_params1,
+ bit_depth);
ConvolveParams conv_params2 =
get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
@@ -677,8 +677,8 @@ class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> {
av1_get_interp_filter_params_with_block_size(filter, height);
const uint16_t *input = FirstRandomInput16(GetParam());
DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
- av1_highbd_convolve_y_sr(input, width, reference, kOutputStride, width,
- height, filter_params_y, sub_y, bit_depth);
+ av1_highbd_convolve_y_sr_c(input, width, reference, kOutputStride, width,
+ height, filter_params_y, sub_y, bit_depth);
DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
filter_params_y, sub_y, bit_depth);
@@ -751,7 +751,7 @@ class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> {
const int height = GetParam().Block().Height();
const uint8_t *input = FirstRandomInput8(GetParam());
DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
- aom_convolve_copy(input, width, reference, kOutputStride, width, height);
+ aom_convolve_copy_c(input, width, reference, kOutputStride, width, height);
DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
AssertOutputBufferEq(reference, test, width, height);
@@ -780,16 +780,6 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyTest,
BuildLowbdParams(aom_convolve_copy_neon));
#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_SUITE_P(MSA, AV1ConvolveCopyTest,
- BuildLowbdParams(aom_convolve_copy_msa));
-#endif
-
-#if HAVE_DSPR2
-INSTANTIATE_TEST_SUITE_P(DSPR2, AV1ConvolveCopyTest,
- BuildLowbdParams(aom_convolve_copy_dspr2));
-#endif
-
#if CONFIG_AV1_HIGHBITDEPTH
///////////////////////////////////////////////////////////////
// Single reference convolve-copy functions (high bit-depth)
@@ -807,8 +797,8 @@ class AV1ConvolveCopyHighbdTest
const int height = block.Height();
const uint16_t *input = FirstRandomInput16(GetParam());
DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
- aom_highbd_convolve_copy(input, width, reference, kOutputStride, width,
- height);
+ aom_highbd_convolve_copy_c(input, width, reference, kOutputStride, width,
+ height);
DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
AssertOutputBufferEq(reference, test, width, height);
@@ -1015,9 +1005,9 @@ class AV1Convolve2DHighbdTest
DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
ConvolveParams conv_params1 =
get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
- av1_highbd_convolve_2d_sr(input, width, reference, kOutputStride, width,
- height, filter_params_x, filter_params_y, sub_x,
- sub_y, &conv_params1, bit_depth);
+ av1_highbd_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
+ height, filter_params_x, filter_params_y, sub_x,
+ sub_y, &conv_params1, bit_depth);
DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
ConvolveParams conv_params2 =
get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
@@ -1234,7 +1224,7 @@ class AV1ConvolveXCompoundTest : public AV1ConvolveTest<convolve_x_func> {
}
virtual convolve_x_func ReferenceFunc() const {
- return av1_dist_wtd_convolve_x;
+ return av1_dist_wtd_convolve_x_c;
}
private:
@@ -1324,7 +1314,7 @@ class AV1ConvolveXHighbdCompoundTest
}
virtual highbd_convolve_x_func ReferenceFunc() const {
- return av1_highbd_dist_wtd_convolve_x;
+ return av1_highbd_dist_wtd_convolve_x_c;
}
private:
@@ -1403,7 +1393,7 @@ class AV1ConvolveYCompoundTest : public AV1ConvolveXCompoundTest {
}
convolve_x_func ReferenceFunc() const override {
- return av1_dist_wtd_convolve_y;
+ return av1_dist_wtd_convolve_y_c;
}
};
@@ -1435,7 +1425,7 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYCompoundTest,
// Again, the X and Y convolve functions have the same type signature and logic.
class AV1ConvolveYHighbdCompoundTest : public AV1ConvolveXHighbdCompoundTest {
highbd_convolve_x_func ReferenceFunc() const override {
- return av1_highbd_dist_wtd_convolve_y;
+ return av1_highbd_dist_wtd_convolve_y_c;
}
const InterpFilterParams *FilterParams(
InterpFilter f, const BlockSize &block) const override {
@@ -1479,6 +1469,11 @@ class AV1Convolve2DCopyCompoundTest
TestConvolve(compound);
}
}
+ void SpeedTest() {
+ for (const auto &compound : GetCompoundParams()) {
+ TestConvolveSpeed(compound, 100000);
+ }
+ }
private:
void TestConvolve(const CompoundParam &compound) {
@@ -1490,7 +1485,7 @@ class AV1Convolve2DCopyCompoundTest
const uint8_t *input2 = SecondRandomInput8(GetParam());
DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
- Convolve(av1_dist_wtd_convolve_2d_copy, input1, input2, reference,
+ Convolve(av1_dist_wtd_convolve_2d_copy_c, input1, input2, reference,
reference_conv_buf, compound);
DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
@@ -1502,7 +1497,45 @@ class AV1Convolve2DCopyCompoundTest
AssertOutputBufferEq(reference, test, width, height);
}
- private:
+ void TestConvolveSpeed(const CompoundParam &compound, const int num_iters) {
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+
+ const uint8_t *src0 = FirstRandomInput8(GetParam());
+ const uint8_t *src1 = SecondRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, dst[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, conv_buf[MAX_SB_SQUARE]);
+
+ const auto test_func = GetParam().TestFunction();
+
+ ConvolveParams conv_params_0 =
+ GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+ ConvolveParams conv_params_1 =
+ GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ av1_dist_wtd_convolve_2d_copy_c(src0, width, dst, kOutputStride, width,
+ height, &conv_params_0);
+ av1_dist_wtd_convolve_2d_copy_c(src1, width, dst, kOutputStride, width,
+ height, &conv_params_1);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ test_func(src0, width, dst, kOutputStride, width, height, &conv_params_0);
+ test_func(src1, width, dst, kOutputStride, width, height, &conv_params_1);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ printf("Dist Weighted: %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n",
+ compound.UseDistWtdCompAvg(), width, height, time1, time2,
+ time1 / time2);
+ }
+
void Convolve(compound_conv_2d_copy_func test_func, const uint8_t *src1,
const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
const CompoundParam &compound) {
@@ -1519,6 +1552,7 @@ class AV1Convolve2DCopyCompoundTest
};
TEST_P(AV1Convolve2DCopyCompoundTest, RunTest) { RunTest(); }
+TEST_P(AV1Convolve2DCopyCompoundTest, DISABLED_SpeedTest) { SpeedTest(); }
INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCopyCompoundTest,
BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_c));
@@ -1571,7 +1605,7 @@ class AV1Convolve2DCopyHighbdCompoundTest
const uint16_t *input2 = SecondRandomInput16(GetParam());
DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
- Convolve(av1_highbd_dist_wtd_convolve_2d_copy, input1, input2, reference,
+ Convolve(av1_highbd_dist_wtd_convolve_2d_copy_c, input1, input2, reference,
reference_conv_buf, compound);
DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
@@ -1658,7 +1692,7 @@ class AV1Convolve2DCompoundTest : public AV1ConvolveTest<convolve_2d_func> {
const uint8_t *input2 = SecondRandomInput8(GetParam());
DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
- Convolve(av1_dist_wtd_convolve_2d, input1, input2, reference,
+ Convolve(av1_dist_wtd_convolve_2d_c, input1, input2, reference,
reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
@@ -1756,7 +1790,7 @@ class AV1Convolve2DHighbdCompoundTest
const uint16_t *input2 = SecondRandomInput16(GetParam());
DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
- Convolve(av1_highbd_dist_wtd_convolve_2d, input1, input2, reference,
+ Convolve(av1_highbd_dist_wtd_convolve_2d_c, input1, input2, reference,
reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index c77f501c7..221dd105a 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -28,12 +28,14 @@
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
namespace AV1Kmeans {
-typedef void (*av1_calc_indices_dim1_func)(const int *data,
- const int *centroids,
- uint8_t *indices, int n, int k);
-typedef void (*av1_calc_indices_dim2_func)(const int *data,
- const int *centroids,
- uint8_t *indices, int n, int k);
+typedef void (*av1_calc_indices_dim1_func)(const int16_t *data,
+ const int16_t *centroids,
+ uint8_t *indices,
+ int64_t *total_dist, int n, int k);
+typedef void (*av1_calc_indices_dim2_func)(const int16_t *data,
+ const int16_t *centroids,
+ uint8_t *indices,
+ int64_t *total_dist, int n, int k);
typedef std::tuple<av1_calc_indices_dim1_func, BLOCK_SIZE>
av1_calc_indices_dim1Param;
@@ -66,8 +68,8 @@ class AV1KmeansTest1
}
libaom_test::ACMRandom rnd_;
- int data_[4096];
- int centroids_[8];
+ int16_t data_[4096];
+ int16_t centroids_[8];
uint8_t indices1_[4096];
uint8_t indices2_[4096];
};
@@ -92,9 +94,11 @@ void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
const int w = block_size_wide[bsize];
const int h = block_size_high[bsize];
const int n = w * h;
- av1_calc_indices_dim1_c(data_, centroids_, indices1_, n, k);
- test_impl(data_, centroids_, indices2_, n, k);
+ int64_t total_dist_dim1, total_dist_impl;
+ av1_calc_indices_dim1_c(data_, centroids_, indices1_, &total_dist_dim1, n, k);
+ test_impl(data_, centroids_, indices2_, &total_dist_impl, n, k);
+ ASSERT_EQ(total_dist_dim1, total_dist_impl);
ASSERT_EQ(CheckResult(n), true)
<< " block " << bsize << " index " << n << " Centroids " << k;
}
@@ -113,7 +117,7 @@ void AV1KmeansTest1::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
aom_usec_timer_start(&timer);
av1_calc_indices_dim1_func func = funcs[i];
for (int j = 0; j < num_loops; ++j) {
- func(data_, centroids_, indices1_, n, k);
+ func(data_, centroids_, indices1_, /*total_dist=*/nullptr, n, k);
}
aom_usec_timer_mark(&timer);
double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
@@ -174,8 +178,8 @@ class AV1KmeansTest2
}
libaom_test::ACMRandom rnd_;
- int data_[4096 * 2];
- int centroids_[8 * 2];
+ int16_t data_[4096 * 2];
+ int16_t centroids_[8 * 2];
uint8_t indices1_[4096];
uint8_t indices2_[4096];
};
@@ -200,9 +204,11 @@ void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
const int w = block_size_wide[bsize];
const int h = block_size_high[bsize];
const int n = w * h;
- av1_calc_indices_dim2_c(data_, centroids_, indices1_, n, k);
- test_impl(data_, centroids_, indices2_, n, k);
+ int64_t total_dist_dim2, total_dist_impl;
+ av1_calc_indices_dim2_c(data_, centroids_, indices1_, &total_dist_dim2, n, k);
+ test_impl(data_, centroids_, indices2_, &total_dist_impl, n, k);
+ ASSERT_EQ(total_dist_dim2, total_dist_impl);
ASSERT_EQ(CheckResult(n), true)
<< " block " << bsize << " index " << n << " Centroids " << k;
}
@@ -221,7 +227,7 @@ void AV1KmeansTest2::RunSpeedTest(av1_calc_indices_dim2_func test_impl,
aom_usec_timer_start(&timer);
av1_calc_indices_dim2_func func = funcs[i];
for (int j = 0; j < num_loops; ++j) {
- func(data_, centroids_, indices1_, n, k);
+ func(data_, centroids_, indices1_, /*total_dist=*/nullptr, n, k);
}
aom_usec_timer_mark(&timer);
double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
@@ -278,12 +284,10 @@ INSTANTIATE_TEST_SUITE_P(
SSE2, AV1KmeansTest1,
::testing::Combine(::testing::Values(&av1_calc_indices_dim1_sse2),
::testing::ValuesIn(kValidBlockSize)));
-// TODO(any): Disable av1_calc_indices_dim2 sse2 SIMD and its unit test due to
-// c/SIMD mismatch. Re-enable it after mismatch is fixed.
-// INSTANTIATE_TEST_SUITE_P(
-// SSE2, AV1KmeansTest2,
-// ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_sse2),
-// ::testing::ValuesIn(kValidBlockSize)));
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1KmeansTest2,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_sse2),
+ ::testing::ValuesIn(kValidBlockSize)));
#endif
} // namespace AV1Kmeans
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index 731e99cb9..582364703 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -79,8 +79,8 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
}
for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
+ zbin_ptr[j] = rnd.Rand16Signed();
+ quant_shift_ptr[j] = rnd.Rand16Signed();
// int16_t positive
dequant_ptr[j] = abs(rnd(dequantRange));
quant_ptr[j] = static_cast<int16_t>((1 << 16) / dequant_ptr[j]);
@@ -155,8 +155,8 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
coeff_ptr[rnd(count)] = rnd(coeffRange);
for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
+ zbin_ptr[j] = rnd.Rand16Signed();
+ quant_shift_ptr[j] = rnd.Rand16Signed();
// int16_t positive
dequant_ptr[j] = abs(rnd(dequantRange));
quant_ptr[j] = (1 << 16) / dequant_ptr[j];
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
index a51ce12b2..46f6d923c 100644
--- a/test/av1_wedge_utils_test.cc
+++ b/test/av1_wedge_utils_test.cc
@@ -341,7 +341,7 @@ TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
for (int i = 0; i < MAX_SB_SQUARE; ++i) {
- a[i] = rng_.Rand16();
+ a[i] = rng_.Rand16Signed();
b[i] = rng_(2 * INT16_MAX + 1) - INT16_MAX;
}
@@ -374,6 +374,13 @@ INSTANTIATE_TEST_SUITE_P(
av1_wedge_compute_delta_squares_sse2)));
#endif // HAVE_SSE2
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, WedgeUtilsSSEOptTest,
+ ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
+ av1_wedge_sse_from_residuals_neon)));
+#endif // HAVE_NEON
+
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, WedgeUtilsSSEOptTest,
diff --git a/test/avg_test.cc b/test/avg_test.cc
index b12d1ef4e..4e86f06a2 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -43,7 +43,9 @@ class AverageTestBase : public ::testing::Test {
protected:
// Handle blocks up to 4 blocks 64x64 with stride up to 128
static const int kDataAlignment = 16;
- static const int kDataBlockSize = 64 * 128;
+ static const int kDataBlockWidth = 128;
+ static const int kDataBlockHeight = 128;
+ static const int kDataBlockSize = kDataBlockWidth * kDataBlockHeight;
virtual void SetUp() {
const testing::TestInfo *const test_info =
@@ -236,13 +238,11 @@ class AverageTest_8x8_quad
using AverageTestBase<Pixel>::FillConstant;
using AverageTestBase<Pixel>::FillRandom;
- void CheckAverages(int iterations) {
+ void CheckAveragesAt(int iterations, int x16_idx, int y16_idx) {
ASSERT_EQ(sizeof(Pixel), 1u);
const int block_size = GET_PARAM(4);
(void)block_size;
int expected[4] = { 0 };
- int x16_idx = 0;
- int y16_idx = 0;
// The reference frame, but not the source frame, may be unaligned for
// certain types of searches.
@@ -285,19 +285,25 @@ class AverageTest_8x8_quad
}
}
+ void CheckAverages() {
+ for (int x16_idx = 0; x16_idx < this->kDataBlockWidth / 8; x16_idx += 2)
+ for (int y16_idx = 0; y16_idx < this->kDataBlockHeight / 8; y16_idx += 2)
+ CheckAveragesAt(1, x16_idx, y16_idx);
+ }
+
void TestConstantValue(Pixel value) {
FillConstant(value);
- CheckAverages(1);
+ CheckAverages();
}
void TestRandom() {
FillRandom();
- CheckAverages(1);
+ CheckAverages();
}
void TestSpeed() {
FillRandom();
- CheckAverages(1000000);
+ CheckAveragesAt(1000000, 0, 0);
}
int64_t ref_elapsed_time_ = 0;
@@ -343,20 +349,32 @@ TEST_P(AverageTestHbd, DISABLED_Speed) {
}
#endif // CONFIG_AV1_HIGHBITDEPTH
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
- const int ref_stride, const int height);
+typedef void (*IntProRowFunc)(int16_t *hbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
-// Params: height, asm function, c function.
-typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProRowFunc, IntProRowFunc> IntProRowParam;
class IntProRowTest : public AverageTestBase<uint8_t>,
public ::testing::WithParamInterface<IntProRowParam> {
public:
IntProRowTest()
- : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr),
+ : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), hbuf_asm_(nullptr),
hbuf_c_(nullptr) {
- asm_func_ = GET_PARAM(1);
- c_func_ = GET_PARAM(2);
+ asm_func_ = GET_PARAM(2);
+ c_func_ = GET_PARAM(3);
+ }
+
+ void set_norm_factor() {
+ if (height_ == 128)
+ norm_factor_ = 6;
+ else if (height_ == 64)
+ norm_factor_ = 5;
+ else if (height_ == 32)
+ norm_factor_ = 4;
+ else if (height_ == 16)
+ norm_factor_ = 3;
}
protected:
@@ -366,10 +384,10 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
ASSERT_NE(source_data_, nullptr);
hbuf_asm_ = static_cast<int16_t *>(
- aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+ aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * width_));
ASSERT_NE(hbuf_asm_, nullptr);
hbuf_c_ = static_cast<int16_t *>(
- aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+ aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * width_));
ASSERT_NE(hbuf_c_, nullptr);
}
@@ -383,19 +401,24 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
}
void RunComparison() {
- API_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
- API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
- EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ set_norm_factor();
+ API_REGISTER_STATE_CHECK(
+ c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+ API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, width_, width_,
+ height_, norm_factor_));
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
<< "Output mismatch\n";
}
void RunSpeedTest() {
const int numIter = 5000000;
- printf("Height = %d number of iteration is %d \n", height_, numIter);
+ set_norm_factor();
+ printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+ numIter);
aom_usec_timer c_timer_;
aom_usec_timer_start(&c_timer_);
for (int i = 0; i < numIter; i++) {
- c_func_(hbuf_c_, source_data_, 0, height_);
+ c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&c_timer_);
@@ -403,7 +426,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
aom_usec_timer_start(&asm_timer_);
for (int i = 0; i < numIter; i++) {
- asm_func_(hbuf_asm_, source_data_, 0, height_);
+ asm_func_(hbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&asm_timer_);
@@ -415,7 +438,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
asm_sum_time,
(static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
- EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
<< "Output mismatch\n";
}
@@ -424,35 +447,68 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
IntProRowFunc c_func_;
int16_t *hbuf_asm_;
int16_t *hbuf_c_;
+ int norm_factor_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+typedef void (*IntProColFunc)(int16_t *vbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
-// Params: width, asm function, c function.
-typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProColFunc, IntProColFunc> IntProColParam;
class IntProColTest : public AverageTestBase<uint8_t>,
public ::testing::WithParamInterface<IntProColParam> {
public:
- IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
- asm_func_ = GET_PARAM(1);
- c_func_ = GET_PARAM(2);
+ IntProColTest()
+ : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), vbuf_asm_(nullptr),
+ vbuf_c_(nullptr) {
+ asm_func_ = GET_PARAM(2);
+ c_func_ = GET_PARAM(3);
}
protected:
+ virtual void SetUp() {
+ source_data_ = static_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+ ASSERT_NE(source_data_, nullptr);
+
+ vbuf_asm_ = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*vbuf_asm_) * width_));
+ ASSERT_NE(vbuf_asm_, nullptr);
+ vbuf_c_ = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*vbuf_c_) * width_));
+ ASSERT_NE(vbuf_c_, nullptr);
+ }
+
+ virtual void TearDown() {
+ aom_free(source_data_);
+ source_data_ = nullptr;
+ aom_free(vbuf_c_);
+ vbuf_c_ = nullptr;
+ aom_free(vbuf_asm_);
+ vbuf_asm_ = nullptr;
+ }
+
void RunComparison() {
- API_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
- API_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
- EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+ int norm_factor_ = 3 + (width_ >> 5);
+ API_REGISTER_STATE_CHECK(
+ c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+ API_REGISTER_STATE_CHECK(asm_func_(vbuf_asm_, source_data_, width_, width_,
+ height_, norm_factor_));
+ EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+ << "Output mismatch\n";
}
void RunSpeedTest() {
const int numIter = 5000000;
- printf("Width = %d number of iteration is %d \n", width_, numIter);
+ printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+ numIter);
+ int norm_factor_ = 3 + (width_ >> 5);
aom_usec_timer c_timer_;
aom_usec_timer_start(&c_timer_);
for (int i = 0; i < numIter; i++) {
- sum_c_ = c_func_(source_data_, width_);
+ c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&c_timer_);
@@ -460,7 +516,7 @@ class IntProColTest : public AverageTestBase<uint8_t>,
aom_usec_timer_start(&asm_timer_);
for (int i = 0; i < numIter; i++) {
- sum_asm_ = asm_func_(source_data_, width_);
+ asm_func_(vbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&asm_timer_);
@@ -472,14 +528,15 @@ class IntProColTest : public AverageTestBase<uint8_t>,
asm_sum_time,
(static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
- EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+ EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+ << "Output mismatch\n";
}
private:
IntProColFunc asm_func_;
IntProColFunc c_func_;
- int16_t sum_asm_;
- int16_t sum_c_;
+ int16_t *vbuf_asm_;
+ int16_t *vbuf_c_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
@@ -636,7 +693,7 @@ TEST_P(VectorVarTest, Random) {
}
TEST_P(VectorVarTest, DISABLED_Speed) {
FillRandom();
- const int numIter = 50000;
+ const int numIter = 5000000;
printf("Width = %d number of iteration is %d \n", width, numIter);
int sum_c_var = 0;
@@ -703,19 +760,19 @@ INSTANTIATE_TEST_SUITE_P(
INSTANTIATE_TEST_SUITE_P(
SSE2, IntProRowTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(128, &aom_int_pro_row_sse2,
- &aom_int_pro_row_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_sse2, &aom_int_pro_row_c)));
INSTANTIATE_TEST_SUITE_P(
SSE2, IntProColTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(128, &aom_int_pro_col_sse2,
- &aom_int_pro_col_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_sse2, &aom_int_pro_col_c)));
#endif
#if HAVE_AVX2
@@ -724,6 +781,22 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_avx2),
make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_avx2),
make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProRowTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_avx2, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProColTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_avx2, &aom_int_pro_col_c)));
#endif
#if HAVE_NEON
@@ -737,19 +810,19 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_neon)));
INSTANTIATE_TEST_SUITE_P(
NEON, IntProRowTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(128, &aom_int_pro_row_neon,
- &aom_int_pro_row_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_neon, &aom_int_pro_row_c)));
INSTANTIATE_TEST_SUITE_P(
NEON, IntProColTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(128, &aom_int_pro_col_neon,
- &aom_int_pro_col_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_neon, &aom_int_pro_col_c)));
INSTANTIATE_TEST_SUITE_P(
NEON, AvgTest8bpp_avg_8x8_quad,
@@ -942,6 +1015,16 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
#endif
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(5, &aom_vector_var_c,
+ &aom_vector_var_sse4_1)));
+#endif // HAVE_SSE4_1
+
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, SatdTest,
@@ -950,7 +1033,14 @@ INSTANTIATE_TEST_SUITE_P(
SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_avx2),
SatdTestParam<SatdFunc>(1024, &aom_satd_c,
&aom_satd_avx2)));
-#endif
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_avx2),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_avx2),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_avx2),
+ make_tuple(5, &aom_vector_var_c, &aom_vector_var_avx2)));
+#endif // HAVE_AVX2
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(
diff --git a/test/avif_progressive_test.cc b/test/avif_progressive_test.cc
new file mode 100644
index 000000000..cf94fd5c2
--- /dev/null
+++ b/test/avif_progressive_test.cc
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstddef>
+#include <vector>
+
+#include "aom/aomcx.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_image.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// This test emulates how libavif calls libaom functions to encode a
+// progressive AVIF image in libavif's ProgressiveTest.QualityChange test.
+TEST(AVIFProgressiveTest, QualityChange) {
+ constexpr int kWidth = 256;
+ constexpr int kHeight = 256;
+ // Dummy buffer of neutral gray samples.
+ constexpr size_t kBufferSize = 3 * kWidth * kHeight;
+ std::vector<unsigned char> buffer(kBufferSize,
+ static_cast<unsigned char>(128));
+
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I444, kWidth, kHeight, 1,
+ buffer.data()));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ const unsigned int usage = AOM_USAGE_GOOD_QUALITY;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, usage));
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 1;
+ cfg.g_bit_depth = AOM_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_min_quantizer = 50;
+ cfg.rc_max_quantizer = 50;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 50));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+
+ // First frame (layer 0)
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Second frame (layer 1)
+ cfg.rc_min_quantizer = 0;
+ cfg.rc_max_quantizer = 0;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_LOSSLESS, 1));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 1));
+ aom_enc_frame_flags_t encode_flags =
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+ AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, encode_flags));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// This test emulates how libavif calls libaom functions to encode a
+// progressive AVIF image in libavif's ProgressiveTest.DimensionChange test.
+TEST(AVIFProgressiveTest, DimensionChange) {
+ constexpr int kWidth = 256;
+ constexpr int kHeight = 256;
+ // Dummy buffer of neutral gray samples.
+ constexpr size_t kBufferSize = 3 * kWidth * kHeight;
+ std::vector<unsigned char> buffer(kBufferSize,
+ static_cast<unsigned char>(128));
+
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I444, kWidth, kHeight, 1,
+ buffer.data()));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ const unsigned int usage = AOM_USAGE_GOOD_QUALITY;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, usage));
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 1;
+ cfg.g_bit_depth = AOM_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_min_quantizer = 0;
+ cfg.rc_max_quantizer = 0;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_LOSSLESS, 1));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+
+ // First frame (layer 0)
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0));
+ aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO };
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Second frame (layer 1)
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 1));
+ aom_enc_frame_flags_t encode_flags =
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+ AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, encode_flags));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+} // namespace
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 98cc9ab83..97533da5b 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -192,7 +192,7 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubAvgTest);
TEST_P(CFLSubAvgTest, SubAvgTest) {
for (int it = 0; it < NUM_ITERATIONS; it++) {
- randData(&ACMRandom::Rand15Signed);
+ randData(&ACMRandom::Rand15);
sub_avg((uint16_t *)data, data);
sub_avg_ref((uint16_t *)data_ref, data_ref);
assert_eq<int16_t>(data, data_ref, width, height);
@@ -202,7 +202,7 @@ TEST_P(CFLSubAvgTest, SubAvgTest) {
TEST_P(CFLSubAvgTest, DISABLED_SubAvgSpeedTest) {
aom_usec_timer ref_timer;
aom_usec_timer timer;
- randData(&ACMRandom::Rand15Signed);
+ randData(&ACMRandom::Rand15);
aom_usec_timer_start(&ref_timer);
for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
sub_avg_ref((uint16_t *)data_ref, data_ref);
diff --git a/test/cnn_test.cc b/test/cnn_test.cc
index 96bb9e354..77d8d5562 100644
--- a/test/cnn_test.cc
+++ b/test/cnn_test.cc
@@ -26,7 +26,7 @@
#define SQR(x) ((x) * (x))
-// Best possible pixelwise guarenteed preicison given each float has at most
+// Best possible pixelwise guaranteed precision given each float has at most
// 3 specified decimals.
#define PIXELWISE_FLOAT_TOL 1E-2
diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc
index e59cc2766..673205acd 100644
--- a/test/corner_match_test.cc
+++ b/test/corner_match_test.cc
@@ -12,14 +12,14 @@
#include <new>
#include <tuple>
-#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/util.h"
#include "test/register_state_check.h"
-#include "av1/encoder/corner_match.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
namespace test_libaom {
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index ffa41e0dc..8fdc66216 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -85,6 +85,66 @@ class DatarateTestLarge
<< " The datarate for the file is greater than target by too much!";
}
+ virtual void BasicRateTargetingCBRSpikeTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.kf_max_dist = 3000;
+ cfg_.kf_min_dist = 3000;
+
+ ::libaom_test::I420VideoSource video("desktopqvga2.320_240.yuv", 320, 240,
+ 30, 1, 0, 800);
+ const int bitrate_array[2] = { 100, 200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ max_perc_spike_ = 3.0;
+ max_perc_spike_high_ = 8.0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.19)
+ << " The datarate for the file is greater than target by too much!";
+ ASSERT_LT(num_spikes_, 8);
+ ASSERT_LT(num_spikes_high_, 1);
+ }
+
+ virtual void BasicRateTargetingCBRDynamicBitrateTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.kf_max_dist = 3000;
+ cfg_.kf_min_dist = 3000;
+
+ ::libaom_test::I420VideoSource video("desktop1.320_180.yuv", 320, 180, 30,
+ 1, 0, 800);
+ const int bitrate_array[2] = { 100, 200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ target_bitrate_update_[0] = cfg_.rc_target_bitrate;
+ target_bitrate_update_[1] = static_cast<int>(1.3 * cfg_.rc_target_bitrate);
+ target_bitrate_update_[2] = static_cast<int>(0.7 * cfg_.rc_target_bitrate);
+ frame_update_bitrate_ = 250;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < 3; i++) {
+ ASSERT_GE(effective_datarate_dynamic_[i],
+ target_bitrate_update_[i] * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_dynamic_[i],
+ target_bitrate_update_[i] * 1.20)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
virtual void BasicRateTargetingMultiThreadCBRTest() {
::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
1, 0, 400);
@@ -287,7 +347,7 @@ class DatarateTestFrameDropLarge
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
<< " The datarate for the file is lower than target by too much!";
- ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.17)
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.31)
<< " The datarate for the file is greater than target by too much!";
if (last_drop > 0) {
ASSERT_LE(first_drop_, last_drop)
@@ -399,7 +459,7 @@ class DatarateTestSpeedChangeRealtime
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.83)
<< " The datarate for the file is lower than target by too much!";
- ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.24)
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.35)
<< " The datarate for the file is greater than target by too much!";
}
};
@@ -414,6 +474,19 @@ TEST_P(DatarateTestRealtime, BasicRateTargetingCBR) {
BasicRateTargetingCBRTest();
}
+// Check basic rate targeting for CBR. Use a longer clip,
+// and verify the number of frame size spikes above the threshold.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBRSpike) {
+ BasicRateTargetingCBRSpikeTest();
+}
+
+// Check basic rate targeting for CBR. Use a longer clip,
+// and verify the encoder can respond and hit new bitrates updated
+// within the stream.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBRDynamicBitrate) {
+ BasicRateTargetingCBRDynamicBitrateTest();
+}
+
// Check basic rate targeting for CBR, with 4 threads
TEST_P(DatarateTestRealtime, BasicRateTargetingMultiThreadCBR) {
BasicRateTargetingMultiThreadCBRTest();
diff --git a/test/datarate_test.h b/test/datarate_test.h
index cb5d6e5a7..4b74c6533 100644
--- a/test/datarate_test.h
+++ b/test/datarate_test.h
@@ -44,6 +44,17 @@ class DatarateTest : public ::libaom_test::EncoderTest {
denoiser_offon_period_ = -1;
tile_column_ = 0;
screen_mode_ = false;
+ max_perc_spike_ = 1.0;
+ max_perc_spike_high_ = 1.0;
+ num_spikes_ = 0;
+ num_spikes_high_ = 0;
+ frame_update_bitrate_ = 0;
+ for (int i = 0; i < 3; i++) {
+ target_bitrate_update_[i] = 0;
+ frame_number_dynamic_[i] = 0;
+ bits_total_dynamic_[i] = 0;
+ effective_datarate_dynamic_[i] = 0.0;
+ }
}
virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
@@ -85,6 +96,16 @@ class DatarateTest : public ::libaom_test::EncoderTest {
}
}
+ if (frame_update_bitrate_ > 0) {
+ if (frame_number_ == frame_update_bitrate_) {
+ cfg_.rc_target_bitrate = target_bitrate_update_[1];
+ encoder->Config(&cfg_);
+ } else if (frame_number_ == 2 * frame_update_bitrate_) {
+ cfg_.rc_target_bitrate = target_bitrate_update_[2];
+ encoder->Config(&cfg_);
+ }
+ }
+
if (denoiser_offon_test_) {
ASSERT_GT(denoiser_offon_period_, 0)
<< "denoiser_offon_period_ is not positive.";
@@ -132,12 +153,38 @@ class DatarateTest : public ::libaom_test::EncoderTest {
last_pts_ = pkt->data.frame.pts;
++frame_number_;
++tot_frame_number_;
+ const int per_frame_bandwidth = (cfg_.rc_target_bitrate * 1000) / 30;
+ if (frame_size_in_bits > max_perc_spike_ * per_frame_bandwidth &&
+ frame_number_ > 1)
+ num_spikes_++;
+ if (frame_size_in_bits > max_perc_spike_high_ * per_frame_bandwidth &&
+ frame_number_ > 1)
+ num_spikes_high_++;
+
+ if (frame_update_bitrate_ > 0) {
+ if (frame_number_ < frame_update_bitrate_) {
+ bits_total_dynamic_[0] += frame_size_in_bits;
+ frame_number_dynamic_[0]++;
+ } else if (frame_number_ >= frame_update_bitrate_ &&
+ frame_number_ < 2 * frame_update_bitrate_) {
+ bits_total_dynamic_[1] += frame_size_in_bits;
+ frame_number_dynamic_[1]++;
+ } else {
+ bits_total_dynamic_[2] += frame_size_in_bits;
+ frame_number_dynamic_[2]++;
+ }
+ }
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
duration_ = (last_pts_ + 1) * timebase_;
// Effective file datarate:
effective_datarate_ = (bits_total_ / 1000.0) / duration_;
+ if (frame_update_bitrate_ > 0) {
+ for (int i = 0; i < 3; i++)
+ effective_datarate_dynamic_[i] =
+ 30 * (bits_total_dynamic_[i] / 1000.0) / frame_number_dynamic_[i];
+ }
}
aom_codec_pts_t last_pts_;
@@ -158,6 +205,18 @@ class DatarateTest : public ::libaom_test::EncoderTest {
bool speed_change_test_;
int tile_column_;
bool screen_mode_;
+ double max_perc_spike_;
+ double max_perc_spike_high_;
+ int num_spikes_;
+ int num_spikes_high_;
+  // These are used for tests with dynamic bitrate change. They verify that
+  // the encoder can respond to and hit a bitrate that is updated during the
+  // sequence.
+ int frame_update_bitrate_;
+ int target_bitrate_update_[3];
+ double effective_datarate_dynamic_[3];
+ int64_t bits_total_dynamic_[3];
+ int frame_number_dynamic_[3];
};
} // namespace
diff --git a/test/deltaq_mode_test.cc b/test/deltaq_mode_test.cc
new file mode 100644
index 000000000..0a5e5aaca
--- /dev/null
+++ b/test/deltaq_mode_test.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstddef>
+#include <vector>
+
+#include "aom/aomcx.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_image.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+/*
+ Reproduces https://crbug.com/aomedia/3376. Emulates the command line:
+
+ ./aomenc --cpu-used=6 --threads=10 --cq-level=14 --passes=1 --limit=1 \
+ --lag-in-frames=0 --end-usage=q --deltaq-mode=3 --min-q=0 --max-q=63 \
+ -o output.av1 niklas_1280_720_30.y4m
+*/
+TEST(DeltaqModeTest, DeltaqMode3MultiThread) {
+ constexpr int kWidth = 1280;
+ constexpr int kHeight = 720;
+ // Dummy buffer of neutral gray samples.
+ constexpr size_t kBufferSize = kWidth * kHeight + kWidth * kHeight / 2;
+ std::vector<unsigned char> buffer(kBufferSize,
+ static_cast<unsigned char>(128));
+
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY),
+ AOM_CODEC_OK);
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.g_threads = 10;
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 0;
+ cfg.g_bit_depth = AOM_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_min_quantizer = 0;
+ cfg.rc_max_quantizer = 63;
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_limit = 1;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 6), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 14), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_DELTAQ_MODE, 3), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_set_option(&enc, "passes", "1"), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_STUDIO_RANGE),
+ AOM_CODEC_OK);
+
+ EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+} // namespace
diff --git a/test/ducky_encode_test.cc b/test/ducky_encode_test.cc
index ce642536d..7bbdc88df 100644
--- a/test/ducky_encode_test.cc
+++ b/test/ducky_encode_test.cc
@@ -19,9 +19,12 @@
#include <string>
#include <vector>
-#include "av1/ducky_encode.h"
#include "av1/encoder/encoder.h"
+#include "av1/qmode_rc/ducky_encode.h"
+#include "av1/qmode_rc/ratectrl_qmode.h"
+#include "av1/qmode_rc/ratectrl_qmode_interface.h"
#include "test/video_source.h"
+#include "third_party/googletest/src/googlemock/include/gmock/gmock.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
namespace aom {
@@ -35,7 +38,7 @@ TEST(DuckyEncodeTest, ComputeFirstPassStats) {
1, "bus_352x288_420_f20_b8.yuv" };
video_info.file_path =
libaom_test::GetDataPath() + "/" + video_info.file_path;
- DuckyEncode ducky_encode(video_info, kMaxRefFrames);
+ DuckyEncode ducky_encode(video_info, BLOCK_64X64, kMaxRefFrames, 3, 128);
std::vector<FIRSTPASS_STATS> frame_stats =
ducky_encode.ComputeFirstPassStats();
EXPECT_EQ(frame_stats.size(), static_cast<size_t>(video_info.frame_count));
@@ -52,7 +55,7 @@ TEST(DuckyEncodeTest, EncodeFrame) {
17, "bus_352x288_420_f20_b8.yuv" };
video_info.file_path =
libaom_test::GetDataPath() + "/" + video_info.file_path;
- DuckyEncode ducky_encode(video_info, kMaxRefFrames);
+ DuckyEncode ducky_encode(video_info, BLOCK_64X64, kMaxRefFrames, 3, 128);
std::vector<FIRSTPASS_STATS> frame_stats =
ducky_encode.ComputeFirstPassStats();
ducky_encode.StartEncode(frame_stats);
@@ -78,7 +81,7 @@ TEST(DuckyEncodeTest, EncodeFrameWithQindex) {
17, "bus_352x288_420_f20_b8.yuv" };
video_info.file_path =
libaom_test::GetDataPath() + "/" + video_info.file_path;
- DuckyEncode ducky_encode(video_info, kMaxRefFrames);
+ DuckyEncode ducky_encode(video_info, BLOCK_64X64, kMaxRefFrames, 3, 128);
std::vector<FIRSTPASS_STATS> frame_stats =
ducky_encode.ComputeFirstPassStats();
ducky_encode.StartEncode(frame_stats);
@@ -90,16 +93,93 @@ TEST(DuckyEncodeTest, EncodeFrameWithQindex) {
int q_index = 0;
EncodeFrameDecision decision = { aom::EncodeFrameMode::kQindex,
aom::EncodeGopMode::kNone,
- { q_index, -1 } };
+ { q_index, -1, {}, {} } };
for (int i = 0; i < coding_frame_count; ++i) {
ducky_encode.AllocateBitstreamBuffer(video_info);
EncodeFrameResult encode_frame_result = ducky_encode.EncodeFrame(decision);
- // TODO(angiebird): Check why distortion is not zero when q_index = 0
EXPECT_EQ(encode_frame_result.dist, 0);
}
ducky_encode.EndEncode();
}
+TEST(DuckyEncodeRCTest, EncodeVideoWithRC) {
+ aom_rational_t frame_rate = { 30, 1 };
+ const int frame_number = 35;
+ const int frame_width = 352;
+ const int frame_height = 288;
+ VideoInfo video_info = { frame_width, frame_height,
+ frame_rate, AOM_IMG_FMT_I420,
+ frame_number, "bus_352x288_420_f20_b8.yuv" };
+ video_info.file_path =
+ libaom_test::GetDataPath() + "/" + video_info.file_path;
+ DuckyEncode ducky_encode(video_info, BLOCK_64X64, kMaxRefFrames, 3, 128);
+
+ AV1RateControlQMode qmode_rc;
+ RateControlParam rc_param = {};
+ rc_param.max_gop_show_frame_count = 16;
+ rc_param.min_gop_show_frame_count = 4;
+ rc_param.ref_frame_table_size = 5;
+ rc_param.max_ref_frames = 3;
+ rc_param.base_q_index = 45;
+ rc_param.max_distinct_q_indices_per_frame = 8;
+ rc_param.max_distinct_lambda_scales_per_frame = 1;
+ rc_param.frame_width = frame_width;
+ rc_param.frame_height = frame_height;
+ rc_param.tpl_pass_count = TplPassCount::kOneTplPass;
+ rc_param.tpl_pass_index = 0;
+ const Status status = qmode_rc.SetRcParam(rc_param);
+ ASSERT_TRUE(status.ok());
+ FirstpassInfo firstpass_info;
+ firstpass_info.stats_list = ducky_encode.ComputeFirstPassStats();
+ constexpr int kBlockSize = 16;
+ firstpass_info.num_mbs_16x16 = ((frame_width + kBlockSize - 1) / kBlockSize) *
+ ((frame_height + kBlockSize - 1) / kBlockSize);
+ const auto gop_info = qmode_rc.DetermineGopInfo(firstpass_info);
+ ASSERT_TRUE(gop_info.status().ok());
+ const GopStructList &gop_list = gop_info.value();
+
+ std::vector<aom::GopEncodeInfo> tpl_pass_gop_encode_info_list;
+ std::vector<aom::TplGopStats> tpl_gop_stats_list;
+ for (const auto &gop_struct : gop_list) {
+ const auto gop_encode_info =
+ qmode_rc.GetTplPassGopEncodeInfo(gop_struct, firstpass_info);
+ ASSERT_TRUE(gop_encode_info.status().ok());
+ tpl_pass_gop_encode_info_list.push_back(std::move(*gop_encode_info));
+ }
+
+ tpl_gop_stats_list = ducky_encode.ComputeTplStats(
+ firstpass_info.stats_list, gop_list, tpl_pass_gop_encode_info_list);
+
+ std::vector<aom::GopEncodeInfo> final_pass_gop_encode_info_list;
+ aom::RefFrameTable ref_frame_table;
+ for (size_t i = 0; i < gop_list.size(); ++i) {
+ const aom::GopStruct &gop_struct = gop_list[i];
+ const aom::TplGopStats &tpl_gop_stats = tpl_gop_stats_list[i];
+ std::vector<aom::LookaheadStats> lookahead_stats = {};
+ for (size_t lookahead_index = 1;
+ lookahead_index <= 1 && i + lookahead_index < gop_list.size();
+ ++lookahead_index) {
+ lookahead_stats.push_back({ &gop_list[i + lookahead_index],
+ &tpl_gop_stats_list[i + lookahead_index] });
+ }
+ const auto gop_encode_info =
+ qmode_rc.GetGopEncodeInfo(gop_struct, tpl_gop_stats, lookahead_stats,
+ firstpass_info, ref_frame_table);
+ ASSERT_TRUE(gop_encode_info.status().ok());
+ ref_frame_table = gop_encode_info.value().final_snapshot;
+ final_pass_gop_encode_info_list.push_back(std::move(*gop_encode_info));
+ }
+
+ ducky_encode.StartEncode(firstpass_info.stats_list);
+ std::vector<aom::EncodeFrameResult> encoded_frames_list =
+ ducky_encode.EncodeVideo(gop_list, final_pass_gop_encode_info_list);
+ ducky_encode.EndEncode();
+
+ EXPECT_THAT(encoded_frames_list,
+ testing::Each(testing::Field(
+ "psnr", &aom::EncodeFrameResult::psnr, testing::Gt(37))));
+}
+
TEST(DuckyEncodeTest, EncodeFrameMode) {
EXPECT_EQ(DUCKY_ENCODE_FRAME_MODE_NONE,
static_cast<DUCKY_ENCODE_FRAME_MODE>(EncodeFrameMode::kNone));
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 2566abec0..830388043 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -17,13 +17,14 @@
#include "aom/aomcx.h"
#include "aom/aom_encoder.h"
+#include "aom/aom_image.h"
namespace {
#if CONFIG_REALTIME_ONLY
-const int kUsage = 1;
+const int kUsage = AOM_USAGE_REALTIME;
#else
-const int kUsage = 0;
+const int kUsage = AOM_USAGE_GOOD_QUALITY;
#endif
TEST(EncodeAPI, InvalidParams) {
@@ -81,6 +82,30 @@ TEST(EncodeAPI, InvalidControlId) {
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
}
+TEST(EncodeAPI, SetSFrameOnFirstFrame) {
+ constexpr int kWidth = 2;
+ constexpr int kHeight = 128;
+ unsigned char kBuffer[kWidth * kHeight * 3] = { 0 };
+ aom_image_t img;
+ ASSERT_EQ(aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1, kBuffer),
+ &img);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, kUsage), AOM_CODEC_OK);
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+
+ aom_codec_ctx_t enc;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+ // One of these aom_codec_encode() calls should fail.
+ if (aom_codec_encode(&enc, &img, 0, 1, AOM_EFLAG_SET_S_FRAME) ==
+ AOM_CODEC_OK) {
+ EXPECT_NE(aom_codec_encode(&enc, NULL, 0, 0, 0), AOM_CODEC_OK);
+ }
+ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
#if !CONFIG_REALTIME_ONLY
TEST(EncodeAPI, AllIntraMode) {
aom_codec_iface_t *iface = aom_codec_av1_cx();
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index ee09ea6af..c1b670986 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -229,7 +229,7 @@ void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func,
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int i = 0; i < width * height; i++) {
- coeff[i] = rnd.Rand15Signed() + rnd.Rand15Signed();
+ coeff[i] = rnd.Rand16Signed();
}
for (int i = 0; i < TX_PAD_2D; i++) {
levels_buf[0][i] = rnd.Rand8();
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index e4befd5f8..a6b442fbb 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -288,14 +288,6 @@ INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
::testing::ValuesIn(kErrorBlockTestParamsAvx2));
#endif // HAVE_AVX2
-#if (HAVE_MSA)
-INSTANTIATE_TEST_SUITE_P(
- MSA, ErrorBlockTest,
- ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_msa>,
- &BlockError8BitWrapper<av1_block_error_c>,
- AOM_BITS_8)));
-#endif // HAVE_MSA
-
#if (HAVE_NEON)
const ErrorBlockParam kErrorBlockTestParamsNeon[] = {
make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 1ef72c88a..84330d623 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -154,16 +154,15 @@ class ErrorResilienceTestLarge
AOM_EFLAG_NO_UPD_ARF)) ==
(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) {
ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_DROPPABLE,
- static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_DROPPABLE));
+ AOM_FRAME_IS_DROPPABLE);
}
if (encode_flags & AOM_EFLAG_SET_S_FRAME) {
ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_SWITCH,
- static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_SWITCH));
+ AOM_FRAME_IS_SWITCH);
}
if (encode_flags & AOM_EFLAG_ERROR_RESILIENT) {
- ASSERT_EQ(
- pkt->data.frame.flags & AOM_FRAME_IS_ERROR_RESILIENT,
- static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_ERROR_RESILIENT));
+ ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_ERROR_RESILIENT,
+ AOM_FRAME_IS_ERROR_RESILIENT);
}
}
diff --git a/test/force_key_frame_test.cc b/test/force_key_frame_test.cc
new file mode 100644
index 000000000..2b85d2653
--- /dev/null
+++ b/test/force_key_frame_test.cc
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Tests for https://crbug.com/aomedia/3327.
+//
+// In good-quality mode, set cfg.g_lag_in_frames to 1 or 0 and encode two
+// frames in one-pass mode. Pass AOM_EFLAG_FORCE_KF to the second
+// aom_codec_encode() call. Both frames should be encoded as key frames.
+
+#include <memory>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+void TestOnePassMode(unsigned int lag_in_frames) {
+ // A buffer of gray samples of size 128x128, YUV 4:2:0.
+ constexpr size_t kImageDataSize = 128 * 128 + 2 * 64 * 64;
+ std::unique_ptr<unsigned char[]> img_data(new unsigned char[kImageDataSize]);
+ ASSERT_NE(img_data, nullptr);
+ memset(img_data.get(), 128, kImageDataSize);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY));
+ cfg.g_w = 128;
+ cfg.g_h = 128;
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_lag_in_frames = lag_in_frames;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+
+ aom_image_t img;
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 128, 128, 1, img_data.get()));
+
+ aom_codec_iter_t iter;
+ const aom_codec_cx_pkt_t *pkt;
+ int frame_count = 0;
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ frame_count++;
+ }
+
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_encode(&enc, &img, 1, 1, AOM_EFLAG_FORCE_KF));
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ frame_count++;
+ }
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ frame_count++;
+ }
+
+ EXPECT_EQ(frame_count, 2);
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(ForceKeyFrameTest, OnePassModeLag0) { TestOnePassMode(0); }
+
+TEST(ForceKeyFrameTest, OnePassModeLag1) { TestOnePassMode(1); }
+
+TEST(ForceKeyFrameTest, OnePassModeLag2) { TestOnePassMode(2); }
+
+} // namespace
diff --git a/test/forced_max_frame_width_height_test.cc b/test/forced_max_frame_width_height_test.cc
new file mode 100644
index 000000000..98d96fbaf
--- /dev/null
+++ b/test/forced_max_frame_width_height_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Tests for https://crbug.com/aomedia/3326.
+//
+// Set cfg.g_forced_max_frame_width and cfg.g_forced_max_frame_height and
+// encode two frames of increasing sizes. The second aom_codec_encode() should
+// not crash or have memory errors.
+
+#include <memory>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// cfg.g_lag_in_frames must be set to 0 or 1 to allow the frame size to change,
+// as required by the following check in encoder_set_config() in
+// av1/av1_cx_iface.c:
+//
+// if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+// if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
+// ERROR("Cannot change width or height after initialization");
+// ...
+// }
+
+void RunTest(unsigned int usage, unsigned int lag_in_frames,
+ const char *tune_metric) {
+ // A buffer of gray samples. Large enough for 128x128 and 256x256, YUV 4:2:0.
+ constexpr size_t kImageDataSize = 256 * 256 + 2 * 128 * 128;
+ std::unique_ptr<unsigned char[]> img_data(new unsigned char[kImageDataSize]);
+ ASSERT_NE(img_data, nullptr);
+ memset(img_data.get(), 128, kImageDataSize);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, usage));
+ cfg.g_w = 128;
+ cfg.g_h = 128;
+ cfg.g_forced_max_frame_width = 256;
+ cfg.g_forced_max_frame_height = 256;
+ cfg.g_lag_in_frames = lag_in_frames;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_set_option(&enc, "tune", tune_metric));
+
+ aom_image_t img;
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 128, 128, 1, img_data.get()));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 256, 256, 1, img_data.get()));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag0TunePSNR) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/0, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag0TuneSSIM) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/0, "ssim");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag1TunePSNR) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/1, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag1TuneSSIM) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/1, "ssim");
+}
+
+#endif // !CONFIG_REALTIME_ONLY
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag0TunePSNR) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/0, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag0TuneSSIM) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/0, "ssim");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag1TunePSNR) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/1, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag1TuneSSIM) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/1, "ssim");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, MaxFrameSizeTooBig) {
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ cfg.g_forced_max_frame_width = 131072;
+ cfg.g_forced_max_frame_height = 131072;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, FirstFrameTooBig) {
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = 258;
+ cfg.g_h = 256;
+ cfg.g_forced_max_frame_width = 256;
+ cfg.g_forced_max_frame_height = 256;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ cfg.g_w = 256;
+ cfg.g_h = 258;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, SecondFrameTooBig) {
+ // A buffer of gray samples. Large enough for 128x128 and 256x256, YUV 4:2:0.
+ constexpr size_t kImageDataSize = 256 * 256 + 2 * 128 * 128;
+ std::unique_ptr<unsigned char[]> img_data(new unsigned char[kImageDataSize]);
+ ASSERT_NE(img_data, nullptr);
+ memset(img_data.get(), 128, kImageDataSize);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = 128;
+ cfg.g_h = 128;
+ cfg.g_forced_max_frame_width = 255;
+ cfg.g_forced_max_frame_height = 256;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+
+ aom_image_t img;
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 128, 128, 1, img_data.get()));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_config_set(&enc, &cfg));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+} // namespace
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 2365a20c2..20aea31b0 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <array>
+#include <memory>
+
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/video_source.h"
@@ -73,7 +76,116 @@ TEST_F(AV1FrameSizeTests, OneByOneVideo) {
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
+// Parameters: AOM_USAGE_*, aom_rc_mode, cpu-used.
+class AV1ResolutionChange
+ : public testing::TestWithParam<std::tuple<int, aom_rc_mode, int>> {
+ public:
+ AV1ResolutionChange()
+ : usage_(std::get<0>(GetParam())), rc_mode_(std::get<1>(GetParam())),
+ cpu_used_(std::get<2>(GetParam())) {}
+ AV1ResolutionChange(const AV1ResolutionChange &) = delete;
+ AV1ResolutionChange &operator=(const AV1ResolutionChange &) = delete;
+ ~AV1ResolutionChange() override = default;
+
+ protected:
+ int usage_;
+ aom_rc_mode rc_mode_;
+ int cpu_used_;
+};
+
+TEST_P(AV1ResolutionChange, InvalidRefSize) {
+ struct FrameSize {
+ unsigned int width;
+ unsigned int height;
+ };
+ static constexpr std::array<FrameSize, 3> kFrameSizes = { {
+ { 1768, 200 },
+ { 50, 200 },
+ { 850, 200 },
+ } };
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage_), AOM_CODEC_OK);
+
+ // Resolution changes are only permitted with one-pass encoding and no lag.
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_end_usage = rc_mode_;
+ // TODO(https://crbug.com/aomedia/3349): Setting g_w and g_h shouldn't be
+ // necessary due to the call to aom_codec_enc_config_set() at the start of
+ // the loop. Without this, however, there will be some heap overflows due to
+ // the default being a lower resolution (320x240).
+ cfg.g_w = kFrameSizes[0].width;
+ cfg.g_h = kFrameSizes[0].height;
+
+ aom_codec_ctx_t ctx;
+ EXPECT_EQ(aom_codec_enc_init(&ctx, iface, &cfg, 0), AOM_CODEC_OK);
+ std::unique_ptr<aom_codec_ctx_t, decltype(&aom_codec_destroy)> enc(
+ &ctx, &aom_codec_destroy);
+ EXPECT_EQ(aom_codec_control(enc.get(), AOME_SET_CPUUSED, cpu_used_),
+ AOM_CODEC_OK);
+
+ size_t frame_count = 0;
+ ::libaom_test::RandomVideoSource video;
+ video.Begin();
+ constexpr int kNumFramesPerResolution = 2;
+ for (const auto &frame_size : kFrameSizes) {
+ cfg.g_w = frame_size.width;
+ cfg.g_h = frame_size.height;
+ EXPECT_EQ(aom_codec_enc_config_set(enc.get(), &cfg), AOM_CODEC_OK);
+ video.SetSize(cfg.g_w, cfg.g_h);
+
+ aom_codec_iter_t iter;
+ const aom_codec_cx_pkt_t *pkt;
+
+ for (int i = 0; i < kNumFramesPerResolution; ++i) {
+ video.Next(); // SetSize() does not call FillFrame().
+ EXPECT_EQ(aom_codec_encode(enc.get(), video.img(), video.pts(),
+ video.duration(), /*flags=*/0),
+ AOM_CODEC_OK);
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(enc.get(), &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // The frame following a resolution change should be a keyframe as the
+ // change is too extreme to allow previous references to be used.
+ if (i == 0 || usage_ == AOM_USAGE_ALL_INTRA) {
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ }
+ frame_count++;
+ }
+ }
+ }
+
+ EXPECT_EQ(frame_count, kNumFramesPerResolution * kFrameSizes.size());
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Realtime, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_REALTIME),
+ ::testing::Values(AOM_VBR, AOM_CBR),
+ ::testing::Range(6, 11)));
+
#if !CONFIG_REALTIME_ONLY
+// TODO(https://crbug.com/aomedia/3348): Modes that use av1_full_pixel_search()
+// will cause an assert.
+INSTANTIATE_TEST_SUITE_P(
+ DISABLED_GoodQuality, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
+ ::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
+ ::testing::Range(2, 6)));
+INSTANTIATE_TEST_SUITE_P(
+ DISABLED_GoodQualityLarge, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
+ ::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
+ ::testing::Range(0, 2)));
+INSTANTIATE_TEST_SUITE_P(
+ AllIntra, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_ALL_INTRA),
+ ::testing::Values(AOM_Q), ::testing::Range(6, 10)));
+
typedef struct {
unsigned int width;
unsigned int height;
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 2b01cb8f7..0fe7f42b1 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -57,6 +57,15 @@ void ReferenceHadamard4x4(const int16_t *a, int a_stride, OutputType *b) {
}
for (int i = 0; i < 4; ++i) Hadamard4x4(input + i, buf + i * 4);
for (int i = 0; i < 4; ++i) Hadamard4x4(buf + i, b + i * 4);
+
+ // Extra transpose to match C and SSE2 behavior (i.e., aom_hadamard_4x4).
+ for (int i = 0; i < 4; i++) {
+ for (int j = i + 1; j < 4; j++) {
+ OutputType temp = b[j * 4 + i];
+ b[j * 4 + i] = b[i * 4 + j];
+ b[i * 4 + j] = temp;
+ }
+ }
}
template <typename OutputType>
@@ -94,6 +103,16 @@ void ReferenceHadamard8x8(const int16_t *a, int a_stride, OutputType *b) {
}
for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
+
+ // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_8x8 and
+ // aom_hadamard_lp_8x8).
+ for (int i = 0; i < 8; i++) {
+ for (int j = i + 1; j < 8; j++) {
+ OutputType temp = b[j * 8 + i];
+ b[j * 8 + i] = b[i * 8 + j];
+ b[i * 8 + j] = temp;
+ }
+ }
}
template <typename OutputType>
@@ -105,7 +124,8 @@ void ReferenceHadamard8x8Dual(const int16_t *a, int a_stride, OutputType *b) {
}
template <typename OutputType>
-void ReferenceHadamard16x16(const int16_t *a, int a_stride, OutputType *b) {
+void ReferenceHadamard16x16(const int16_t *a, int a_stride, OutputType *b,
+ bool shift) {
/* The source is a 16x16 block. The destination is rearranged to 8x32.
* Input is 9 bit. */
ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
@@ -135,14 +155,27 @@ void ReferenceHadamard16x16(const int16_t *a, int a_stride, OutputType *b) {
++b;
}
+
+ if (shift) {
+ b -= 64;
+ // Extra shift to match aom_hadamard_16x16_c and aom_hadamard_16x16_avx2.
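+ // Within each group of 16 outputs, swap entries 4..7 with entries 8..11.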
+ for (int i = 0; i < 16; i++) {
+ for (int j = 0; j < 4; j++) {
+ OutputType temp = b[i * 16 + 4 + j];
+ b[i * 16 + 4 + j] = b[i * 16 + 8 + j];
+ b[i * 16 + 8 + j] = temp;
+ }
+ }
+ }
}
template <typename OutputType>
-void ReferenceHadamard32x32(const int16_t *a, int a_stride, OutputType *b) {
- ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
- ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
- ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
- ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
+void ReferenceHadamard32x32(const int16_t *a, int a_stride, OutputType *b,
+ bool shift) {
+ ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0, shift);
+ ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256, shift);
+ ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512, shift);
+ ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768, shift);
for (int i = 0; i < 256; ++i) {
const OutputType a0 = b[0];
@@ -166,11 +199,11 @@ void ReferenceHadamard32x32(const int16_t *a, int a_stride, OutputType *b) {
template <typename OutputType>
void ReferenceHadamard(const int16_t *a, int a_stride, OutputType *b, int bw,
- int bh) {
+ int bh, bool shift) {
if (bw == 32 && bh == 32) {
- ReferenceHadamard32x32(a, a_stride, b);
+ ReferenceHadamard32x32(a, a_stride, b, shift);
} else if (bw == 16 && bh == 16) {
- ReferenceHadamard16x16(a, a_stride, b);
+ ReferenceHadamard16x16(a, a_stride, b, shift);
} else if (bw == 8 && bh == 8) {
ReferenceHadamard8x8(a, a_stride, b);
} else if (bw == 4 && bh == 4) {
@@ -199,10 +232,12 @@ template <typename OutputType, typename HadamardFuncType>
class HadamardTestBase
: public ::testing::TestWithParam<FuncWithSize<HadamardFuncType>> {
public:
- explicit HadamardTestBase(const FuncWithSize<HadamardFuncType> &func_param) {
+ HadamardTestBase(const FuncWithSize<HadamardFuncType> &func_param,
+ bool do_shift) {
h_func_ = func_param.func;
bw_ = func_param.block_width;
bh_ = func_param.block_height;
+ shift_ = do_shift;
}
virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
@@ -222,13 +257,8 @@ class HadamardTestBase
memset(b_ref, 0, sizeof(b_ref));
for (int i = 0; i < block_size_; ++i) a[i] = Rand();
-
- ReferenceHadamard(a, bw_, b_ref, bw_, bh_);
+ ReferenceHadamard(a, bw_, b_ref, bw_, bh_, shift_);
API_REGISTER_STATE_CHECK(h_func_(a, bw_, b));
-
- // The order of the output is not important. Sort before checking.
- std::sort(b, b + block_size_);
- std::sort(b_ref, b_ref + block_size_);
EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
}
@@ -246,12 +276,8 @@ class HadamardTestBase
memset(b, 0, sizeof(b));
memset(b_ref, 0, sizeof(b_ref));
- ReferenceHadamard(a, i, b_ref, bw_, bh_);
+ ReferenceHadamard(a, i, b_ref, bw_, bh_, shift_);
API_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
- // The order of the output is not important. Sort before checking.
- std::sort(b, b + block_size_);
- std::sort(b_ref, b_ref + block_size_);
EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
}
}
@@ -280,11 +306,12 @@ class HadamardTestBase
HadamardFuncType h_func_;
int bw_;
int bh_;
+ bool shift_;
};
class HadamardLowbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
public:
- HadamardLowbdTest() : HadamardTestBase(GetParam()) {}
+ HadamardLowbdTest() : HadamardTestBase(GetParam(), /*do_shift=*/true) {}
virtual int16_t Rand() { return rnd_.Rand9Signed(); }
};
@@ -317,6 +344,8 @@ INSTANTIATE_TEST_SUITE_P(
HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32, 32)));
#endif // HAVE_AVX2
+// TODO(aomedia:3314): Disable the NEON unit test for now, since the hadamard
+// 16x16 NEON implementation needs modifications to match C/AVX2 behavior.
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
NEON, HadamardLowbdTest,
@@ -327,7 +356,7 @@ INSTANTIATE_TEST_SUITE_P(
// Tests for low precision
class HadamardLowbdLPTest : public HadamardTestBase<int16_t, HadamardLPFunc> {
public:
- HadamardLowbdLPTest() : HadamardTestBase(GetParam()) {}
+ HadamardLowbdLPTest() : HadamardTestBase(GetParam(), /*do_shift=*/false) {}
virtual int16_t Rand() { return rnd_.Rand9Signed(); }
};
@@ -371,7 +400,8 @@ INSTANTIATE_TEST_SUITE_P(
class HadamardLowbdLP8x8DualTest
: public HadamardTestBase<int16_t, HadamardLP8x8DualFunc> {
public:
- HadamardLowbdLP8x8DualTest() : HadamardTestBase(GetParam()) {}
+ HadamardLowbdLP8x8DualTest()
+ : HadamardTestBase(GetParam(), /*do_shift=*/false) {}
virtual int16_t Rand() { return rnd_.Rand9Signed(); }
};
diff --git a/test/hash_test.cc b/test/hash_test.cc
index 5ce0fbb3d..61e0b5179 100644
--- a/test/hash_test.cc
+++ b/test/hash_test.cc
@@ -131,4 +131,11 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(kValidBlockSize)));
#endif
+#if HAVE_ARM_CRC32
+INSTANTIATE_TEST_SUITE_P(
+ ARM_CRC32, AV1Crc32cHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc32c_value_arm_crc32),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
} // namespace
diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc
index d1fd57844..2873490db 100644
--- a/test/horver_correlation_test.cc
+++ b/test/horver_correlation_test.cc
@@ -39,8 +39,8 @@ class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
target_func_ = GET_PARAM(0);
}
virtual void TearDown() { aom_free(data_buf_); }
- void RunHorverTest(void);
- void RunHorverTest_ExtremeValues(void);
+ void RunHorverTest();
+ void RunHorverTest_ExtremeValues();
void RunHorverSpeedTest(int run_times);
private:
@@ -50,7 +50,7 @@ class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HorverTest);
-void HorverTest::RunHorverTest(void) {
+void HorverTest::RunHorverTest() {
for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
const int w = block_size_wide[block_size];
const int h = block_size_high[block_size];
@@ -107,7 +107,7 @@ void HorverTest::RunHorverSpeedTest(int run_times) {
}
}
-void HorverTest::RunHorverTest_ExtremeValues(void) {
+void HorverTest::RunHorverTest_ExtremeValues() {
for (int i = 0; i < MAX_SB_SQUARE; ++i) {
// Most of get_horver_test is squaring and summing, so simply saturating
// the whole buffer is most likely to cause an overflow.
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 12e14dc55..323aa93ce 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -56,7 +56,7 @@ const TestVideoParam kTestVideoVectors[] = {
45.0 },
#if CONFIG_AV1_HIGHBITDEPTH
{ "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 27.0,
- 48.0 },
+ 47.9 },
#endif
{ "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 23.0, 56.0 },
// Image coding (single frame).
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 261ab316f..1fc849ee7 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -146,7 +146,7 @@ const DecodeParam kAV1InvalidFileTests[] = {
{ 1, "invalid-oss-fuzz-10227.ivf", nullptr },
{ 4, "invalid-oss-fuzz-10555.ivf", nullptr },
{ 1, "invalid-oss-fuzz-10705.ivf", nullptr },
- { 1, "invalid-oss-fuzz-10723.ivf", "invalid-oss-fuzz-10723.ivf.res.2" },
+ { 1, "invalid-oss-fuzz-10723.ivf", "invalid-oss-fuzz-10723.ivf.res.3" },
{ 1, "invalid-oss-fuzz-10779.ivf", nullptr },
{ 1, "invalid-oss-fuzz-11477.ivf", nullptr },
{ 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" },
diff --git a/test/kf_test.cc b/test/kf_test.cc
index 0cef8db04..5daf60068 100644
--- a/test/kf_test.cc
+++ b/test/kf_test.cc
@@ -84,8 +84,7 @@ class KeyFrameIntervalTestLarge
is_kf_interval_violated_ = true;
}
}
- if ((frame_flags & AOM_FRAME_IS_KEY) ==
- static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY)) {
+ if ((frame_flags & AOM_FRAME_IS_KEY) == AOM_FRAME_IS_KEY) {
if (kf_dist_ != -1 && kf_dist_ < (int)kf_dist_param_.min_kf_dist) {
is_kf_interval_violated_ = true;
}
@@ -186,8 +185,7 @@ class ForcedKeyTestLarge
int frame_flags = 0;
AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FRAME_FLAGS,
&frame_flags);
- if ((frame_flags & AOM_FRAME_IS_KEY) !=
- static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY)) {
+ if ((frame_flags & AOM_FRAME_IS_KEY) != AOM_FRAME_IS_KEY) {
is_kf_placement_violated_ = true;
}
}
diff --git a/test/level_test.cc b/test/level_test.cc
index 9efd6c8f7..8759b9b43 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -22,7 +22,7 @@
namespace {
const int kLevelMin = 0;
const int kLevelMax = 31;
-const int kLevelKeepStats = 24;
+const int kLevelKeepStats = 32;
// Speed settings tested
static const int kCpuUsedVectors[] = {
1,
@@ -87,10 +87,10 @@ TEST_P(LevelTest, TestTargetLevelApi) {
for (int operating_point = 0; operating_point <= 32; ++operating_point) {
for (int level = 0; level <= 32; ++level) {
const int target_level = operating_point * 100 + level;
- if ((level <= 24 && level != 2 && level != 3 && level != 6 &&
- level != 7 && level != 10 && level != 11 && level != 20 &&
- level != 21 && level != 22 && level != 23) ||
- level == 31 || operating_point > 31) {
+ if ((level < 28 && level != 2 && level != 3 && level != 6 && level != 7 &&
+ level != 10 && level != 11) ||
+ level == kLevelMax || level == kLevelKeepStats ||
+ operating_point > 31) {
EXPECT_EQ(AOM_CODEC_OK,
AOM_CODEC_CONTROL_TYPECHECKED(
&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 6960fd3e6..421fdef52 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -587,6 +587,7 @@ const loop_param_t kLoop8Test6Avx2[] = {
make_tuple(&aom_lpf_horizontal_8_quad_avx2, &aom_lpf_horizontal_8_quad_c, 8),
make_tuple(&aom_lpf_horizontal_14_quad_avx2, &aom_lpf_horizontal_14_quad_c,
8),
+ make_tuple(&aom_lpf_vertical_14_quad_avx2, &aom_lpf_vertical_14_quad_c, 8),
};
INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test6Param_lbd,
diff --git a/test/md5_helper.h b/test/md5_helper.h
index 9443cb262..69f1ae76b 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -48,7 +48,7 @@ class MD5 {
MD5Update(&md5_, data, static_cast<uint32_t>(size));
}
- const char *Get(void) {
+ const char *Get() {
static const char hex[16] = {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
diff --git a/test/mock_ratectrl_qmode.h b/test/mock_ratectrl_qmode.h
index 2dbd8eff7..9c9e6e816 100644
--- a/test/mock_ratectrl_qmode.h
+++ b/test/mock_ratectrl_qmode.h
@@ -12,7 +12,7 @@
#ifndef AOM_TEST_MOCK_RATECTRL_QMODE_H_
#define AOM_TEST_MOCK_RATECTRL_QMODE_H_
-#include "av1/ratectrl_qmode_interface.h"
+#include "av1/qmode_rc/ratectrl_qmode_interface.h"
#include "third_party/googletest/src/googlemock/include/gmock/gmock.h"
namespace aom {
@@ -25,8 +25,21 @@ class MockRateControlQMode : public AV1RateControlQModeInterface {
(const FirstpassInfo &firstpass_info), (override));
MOCK_METHOD(StatusOr<GopEncodeInfo>, GetGopEncodeInfo,
(const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
const RefFrameTable &ref_frame_table_snapshot_init),
(override));
+ MOCK_METHOD(StatusOr<GopEncodeInfo>, GetGopEncodeInfo,
+ (const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+ const std::vector<LookaheadStats> &lookahead_stats,
+ const FirstpassInfo &firstpass_info,
+ const RefFrameTable &ref_frame_table_snapshot_init),
+ (override));
+ MOCK_METHOD(StatusOr<GopEncodeInfo>, GetTplPassGopEncodeInfo,
+ (const GopStruct &gop_struct), (override));
+ MOCK_METHOD(StatusOr<GopEncodeInfo>, GetTplPassGopEncodeInfo,
+ (const GopStruct &gop_struct,
+ const FirstpassInfo &firstpass_info),
+ (override));
};
} // namespace aom
diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc
index a71cc9b3d..c5229fc1d 100644
--- a/test/monochrome_test.cc
+++ b/test/monochrome_test.cc
@@ -24,14 +24,14 @@ const unsigned int kCqLevel = 18;
const double kMaxPsnr = 100.0;
// kPsnrThreshold represents the psnr threshold used to validate the quality of
-// the first frame. The indices, 0 and 1 correspond to non-allintra and allintra
-// encoding modes.
-const double kPsnrThreshold[2] = { 29.0, 41.5 };
+// the first frame. The indices correspond to one/two-pass, allintra and
+// realtime encoding modes.
+const double kPsnrThreshold[3] = { 29.0, 41.5, 41.5 };
// kPsnrFluctuation represents the maximum allowed psnr fluctuation w.r.t first
-// frame. The indices, 0 and 1 correspond to non-allintra and allintra encoding
-// modes.
-const double kPsnrFluctuation[2] = { 2.5, 0.3 };
+// frame. The indices correspond to one/two-pass, allintra and realtime
+// encoding modes.
+const double kPsnrFluctuation[3] = { 2.5, 0.3, 16.0 };
class MonochromeTest
: public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
@@ -102,15 +102,17 @@ class MonochromeTest
EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr);
return;
}
- const bool is_allintra = (mode_ == ::libaom_test::kAllIntra);
+ const int psnr_index = (mode_ == ::libaom_test::kRealTime) ? 2
+ : (mode_ == ::libaom_test::kAllIntra) ? 1
+ : 0;
// Check that the initial Y PSNR value is 'high enough', and check that
// subsequent Y PSNR values are 'close' to this initial value.
if (frame0_psnr_y_ == 0.0) {
frame0_psnr_y_ = pkt->data.psnr.psnr[1];
- EXPECT_GT(frame0_psnr_y_, kPsnrThreshold[is_allintra]);
+ EXPECT_GT(frame0_psnr_y_, kPsnrThreshold[psnr_index]);
}
EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_,
- kPsnrFluctuation[is_allintra]);
+ kPsnrFluctuation[psnr_index]);
}
int lossless_;
@@ -171,6 +173,27 @@ TEST_P(MonochromeAllIntraTest, TestMonochromeEncoding) {
}
}
+class MonochromeRealtimeTest : public MonochromeTest {};
+
+TEST_P(MonochromeRealtimeTest, TestMonochromeEncoding) {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 30);
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ // Set monochrome encoding flag
+ cfg_.monochrome = 1;
+ // Run at low bitrate.
+ cfg_.rc_target_bitrate = 40;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Check that the chroma planes are equal across all frames
+ std::vector<int>::const_iterator iter = chroma_value_list_.begin();
+ int initial_chroma_value = *iter;
+ for (; iter != chroma_value_list_.end(); ++iter) {
+ // Check that all decoded frames have the same constant chroma planes.
+ EXPECT_EQ(*iter, initial_chroma_value);
+ }
+}
+
AV1_INSTANTIATE_TEST_SUITE(MonochromeTest,
::testing::Values(::libaom_test::kOnePassGood,
::libaom_test::kTwoPassGood),
@@ -181,4 +204,10 @@ AV1_INSTANTIATE_TEST_SUITE(MonochromeAllIntraTest,
::testing::Values(::libaom_test::kAllIntra),
::testing::Values(0, 1), // lossless
::testing::Values(6, 9)); // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(MonochromeRealtimeTest,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Values(0), // lossless
+ ::testing::Values(6, 8, 10)); // cpu_used
+
} // namespace
diff --git a/test/postproc_filters_test.cc b/test/postproc_filters_test.cc
new file mode 100644
index 000000000..37de5d2cb
--- /dev/null
+++ b/test/postproc_filters_test.cc
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+class PostprocFiltersTest
+ : public ::libaom_test::CodecTestWith2Params<int, unsigned int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ PostprocFiltersTest()
+ : EncoderTest(GET_PARAM(0)), set_skip_postproc_filtering_(false),
+ frame_number_(0), cpu_used_(GET_PARAM(1)), bd_(GET_PARAM(2)) {}
+
+ virtual void SetUp() {
+ InitializeConfig(::libaom_test::kAllIntra);
+ cfg_.g_input_bit_depth = bd_;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ frame_number_ = video->frame();
+ if (frame_number_ == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+ }
+ if (set_skip_postproc_filtering_) {
+ if (frame_number_ == 0) {
+ encoder->Control(AV1E_SET_SKIP_POSTPROC_FILTERING, 1);
+ } else if (frame_number_ == 10) {
+ encoder->Control(AV1E_SET_SKIP_POSTPROC_FILTERING, 0);
+ } else if (frame_number_ == 20) {
+ encoder->Control(AV1E_SET_SKIP_POSTPROC_FILTERING, 1);
+ }
+ }
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ ::libaom_test::MD5 md5_enc;
+ md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz);
+ md5_enc_.push_back(md5_enc.Get());
+ }
+
+ virtual void PostEncodeFrameHook(::libaom_test::Encoder *encoder) {
+ const aom_image_t *img_enc = encoder->GetPreviewFrame();
+ if (!set_skip_postproc_filtering_) {
+ ASSERT_NE(img_enc, nullptr);
+ } else {
+ // Null will be returned if we query the reconstructed frame when
+ // AV1E_SET_SKIP_POSTPROC_FILTERING is set to 1.
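+ // PreEncodeFrameHook skips postproc filtering for frames [0, 10) and
+ // [20, end), and re-enables it for frames [10, 20).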
+ if (frame_number_ < 10) {
+ ASSERT_EQ(img_enc, nullptr);
+ } else if (frame_number_ < 20) {
+ // Reconstructed frame cannot be null when
+ // AV1E_SET_SKIP_POSTPROC_FILTERING is set to 0.
+ ASSERT_NE(img_enc, nullptr);
+ } else {
+ ASSERT_EQ(img_enc, nullptr);
+ }
+ }
+ }
+
+ // The encoder control 'AV1E_SET_SKIP_POSTPROC_FILTERING' can be used to
+ // skip the application of post-processing filters on the reconstructed frame
+ // for ALLINTRA encoding. This unit test validates the bit exactness of two
+ // encoded streams with 'AV1E_SET_SKIP_POSTPROC_FILTERING':
+ // 1. disabled for all frames (default case)
+ // 2. enabled and disabled at different frame indices using control calls.
+ void DoTest() {
+ std::unique_ptr<libaom_test::VideoSource> video(
+ new libaom_test::YUVVideoSource("niklas_640_480_30.yuv",
+ AOM_IMG_FMT_I420, 640, 480, 30, 1, 0,
+ kFrames));
+ ASSERT_NE(video, nullptr);
+
+ // First encode: 'AV1E_SET_SKIP_POSTPROC_FILTERING' disabled for all frames
+ // (default case).
+ set_skip_postproc_filtering_ = false;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ std::vector<std::string> apply_postproc_filters_md5_enc =
+ std::move(md5_enc_);
+ md5_enc_.clear();
+
+ // Second encode: 'AV1E_SET_SKIP_POSTPROC_FILTERING' enabled and disabled at
+ // different frame intervals.
+ set_skip_postproc_filtering_ = true;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ std::vector<std::string> toggle_apply_postproc_filters_md5_enc =
+ std::move(md5_enc_);
+ md5_enc_.clear();
+
+ // Check for bit match.
+ ASSERT_EQ(apply_postproc_filters_md5_enc,
+ toggle_apply_postproc_filters_md5_enc);
+ }
+
+ bool set_skip_postproc_filtering_;
+ unsigned int frame_number_;
+ std::vector<std::string> md5_enc_;
+
+ private:
+ static constexpr int kFrames = 30;
+ static constexpr unsigned int kCqLevel = 18;
+ int cpu_used_;
+ unsigned int bd_;
+};
+
+class PostprocFiltersTestLarge : public PostprocFiltersTest {};
+
+TEST_P(PostprocFiltersTest, MD5Match) { DoTest(); }
+
+TEST_P(PostprocFiltersTestLarge, MD5Match) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(PostprocFiltersTest, ::testing::Values(9),
+ ::testing::Values(8, 10));
+
+// Test cpu_used 3 and 6.
+AV1_INSTANTIATE_TEST_SUITE(PostprocFiltersTestLarge, ::testing::Values(3, 6),
+ ::testing::Values(8, 10));
+
+} // namespace
diff --git a/test/ratectrl_qmode_test.cc b/test/ratectrl_qmode_test.cc
index d950626cd..fa0c19a44 100644
--- a/test/ratectrl_qmode_test.cc
+++ b/test/ratectrl_qmode_test.cc
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/ratectrl_qmode.h"
+#include "av1/qmode_rc/ratectrl_qmode.h"
#include <algorithm>
#include <array>
@@ -18,11 +18,13 @@
#include <fstream>
#include <memory>
#include <numeric>
+#include <random>
#include <string>
+#include <unordered_set>
#include <vector>
-#include "av1/reference_manager.h"
-#include "av1/ducky_encode.h"
+#include "av1/qmode_rc/ducky_encode.h"
+#include "av1/qmode_rc/reference_manager.h"
#include "test/mock_ratectrl_qmode.h"
#include "test/video_source.h"
#include "third_party/googletest/src/googlemock/include/gmock/gmock.h"
@@ -35,6 +37,7 @@ using ::testing::HasSubstr;
constexpr int kRefFrameTableSize = 7;
constexpr int kFrameWidth = 352;
constexpr int kFrameHeight = 288;
+constexpr int kFrameLimit = 250;
MATCHER(IsOkStatus, "") {
*result_listener << "with code " << arg.code
@@ -60,7 +63,8 @@ std::string ReadDouble(std::istream &stream, double *value) {
}
void ReadFirstpassInfo(const std::string &filename,
- aom::FirstpassInfo *firstpass_info) {
+ aom::FirstpassInfo *firstpass_info,
+ const int frame_limit) {
// These golden files are generated by the following command line:
// ./aomenc --width=352 --height=288 --fps=30/1 --limit=250 --codec=av1
// --cpu-used=3 --end-usage=q --cq-level=36 --threads=0 --profile=0
@@ -80,7 +84,9 @@ void ReadFirstpassInfo(const std::string &filename,
firstpass_info->num_mbs_16x16 =
(kFrameWidth / 16 + 1) * (kFrameHeight / 16 + 1);
std::string newline;
- while (std::getline(firstpass_stats_file, newline)) {
+ int frame_number = 0;
+ while (std::getline(firstpass_stats_file, newline) &&
+ frame_number < frame_limit) {
std::istringstream iss(newline);
FIRSTPASS_STATS firstpass_stats_input = {};
ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.frame), "");
@@ -115,6 +121,8 @@ void ReadFirstpassInfo(const std::string &filename,
<< firstpass_info->stats_list.size() + 1 << "\n"
<< newline;
firstpass_info->stats_list.push_back(firstpass_stats_input);
+
+ frame_number++;
}
}
} // namespace
@@ -226,13 +234,12 @@ class RateControlQModeTest : public ::testing::Test {
rc_param_.min_gop_show_frame_count = 4;
rc_param_.ref_frame_table_size = 7;
rc_param_.max_ref_frames = 7;
- rc_param_.max_depth = 5;
rc_param_.base_q_index = 128;
rc_param_.frame_height = kFrameHeight;
rc_param_.frame_width = kFrameWidth;
}
- RateControlParam rc_param_;
+ RateControlParam rc_param_ = {};
};
TEST_F(RateControlQModeTest, ConstructGopARF) {
@@ -240,7 +247,7 @@ TEST_F(RateControlQModeTest, ConstructGopARF) {
const bool has_key_frame = false;
const int global_coding_idx_offset = 5;
const int global_order_idx_offset = 20;
- RefFrameManager ref_frame_manager(kRefFrameTableSize);
+ RefFrameManager ref_frame_manager(kRefFrameTableSize, 7);
GopStruct gop_struct =
ConstructGop(&ref_frame_manager, show_frame_count, has_key_frame,
global_coding_idx_offset, global_order_idx_offset);
@@ -249,8 +256,7 @@ TEST_F(RateControlQModeTest, ConstructGopARF) {
TestGopGlobalOrderIdx(gop_struct, global_order_idx_offset);
TestGopGlobalCodingIdx(gop_struct, global_coding_idx_offset);
TestColocatedShowFrame(gop_struct);
- const int max_layer_depth =
- ref_frame_manager.ForwardMaxSize() + kLayerDepthOffset;
+ const int max_layer_depth = ref_frame_manager.MaxRefFrame();
TestLayerDepth(gop_struct, max_layer_depth);
TestArfInterval(gop_struct);
}
@@ -260,7 +266,7 @@ TEST_F(RateControlQModeTest, ConstructGopKey) {
const bool has_key_frame = true;
const int global_coding_idx_offset = 10;
const int global_order_idx_offset = 8;
- RefFrameManager ref_frame_manager(kRefFrameTableSize);
+ RefFrameManager ref_frame_manager(kRefFrameTableSize, 7);
GopStruct gop_struct =
ConstructGop(&ref_frame_manager, show_frame_count, has_key_frame,
global_coding_idx_offset, global_order_idx_offset);
@@ -269,8 +275,7 @@ TEST_F(RateControlQModeTest, ConstructGopKey) {
TestGopGlobalOrderIdx(gop_struct, global_order_idx_offset);
TestGopGlobalCodingIdx(gop_struct, global_coding_idx_offset);
TestColocatedShowFrame(gop_struct);
- const int max_layer_depth =
- ref_frame_manager.ForwardMaxSize() + kLayerDepthOffset;
+ const int max_layer_depth = ref_frame_manager.MaxRefFrame();
TestLayerDepth(gop_struct, max_layer_depth);
TestArfInterval(gop_struct);
}
@@ -280,7 +285,7 @@ TEST_F(RateControlQModeTest, ConstructShortGop) {
const bool has_key_frame = false;
const int global_coding_idx_offset = 5;
const int global_order_idx_offset = 20;
- RefFrameManager ref_frame_manager(kRefFrameTableSize);
+ RefFrameManager ref_frame_manager(kRefFrameTableSize, 7);
GopStruct gop_struct =
ConstructGop(&ref_frame_manager, show_frame_count, has_key_frame,
global_coding_idx_offset, global_order_idx_offset);
@@ -316,6 +321,7 @@ static TplFrameStats CreateToyTplFrameStatsWithDiffSizes(int min_block_size,
frame_stats.min_block_size = min_block_size;
frame_stats.frame_height = max_h * count;
frame_stats.frame_width = max_w * count;
+ frame_stats.rate_dist_present = false;
for (int i = 0; i < count; ++i) {
for (int j = 0; j < count; ++j) {
int h = max_h >> i;
@@ -372,7 +378,7 @@ double TplFrameStatsAccumulateIntraCost(const TplFrameStats &frame_stats) {
for (auto &block_stats : frame_stats.block_stats_list) {
sum += block_stats.intra_cost;
}
- return sum;
+ return std::max(sum, 1.0);
}
TEST_F(RateControlQModeTest, CreateTplFrameDepStats) {
@@ -578,6 +584,7 @@ TEST_F(RateControlQModeTest, TplFrameDepStatsPropagateSingleWithMotion) {
}
}
+// TODO(jianj): Add tests for non-empty lookahead stats.
TEST_F(RateControlQModeTest, ComputeTplGopDepStats) {
TplGopStats tpl_gop_stats;
std::vector<RefFrameTable> ref_frame_table_list;
@@ -595,7 +602,7 @@ TEST_F(RateControlQModeTest, ComputeTplGopDepStats) {
ref_frame_table_list.push_back(CreateToyRefFrameTable(i));
}
const StatusOr<TplGopDepStats> gop_dep_stats =
- ComputeTplGopDepStats(tpl_gop_stats, ref_frame_table_list);
+ ComputeTplGopDepStats(tpl_gop_stats, {}, ref_frame_table_list);
ASSERT_THAT(gop_dep_stats.status(), IsOkStatus());
double expected_sum = 0;
@@ -622,7 +629,7 @@ TEST(RefFrameManagerTest, GetRefFrameCount) {
GopFrameType::kRegularLeaf,
GopFrameType::kOverlay
};
- RefFrameManager ref_manager(kRefFrameTableSize);
+ RefFrameManager ref_manager(kRefFrameTableSize, 7);
int coding_idx = 0;
const int first_leaf_idx = 3;
EXPECT_EQ(type_list[first_leaf_idx], GopFrameType::kRegularLeaf);
@@ -654,8 +661,8 @@ TEST(RefFrameManagerTest, GetRefFrameCount) {
// After the first kShowExisting, the kIntermediateArf should be moved from
// kForward to kLast due to the cur_global_order_idx_ update
EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 1);
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 1);
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 2);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 2);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 1);
const int second_leaf_idx = 5;
EXPECT_EQ(type_list[second_leaf_idx], GopFrameType::kRegularLeaf);
@@ -669,8 +676,8 @@ TEST(RefFrameManagerTest, GetRefFrameCount) {
EXPECT_EQ(ref_manager.CurGlobalOrderIdx(), 3);
// An additional kRegularLeaf frame is added into kLast
EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 1);
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 1);
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 3);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 2);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 2);
const int first_overlay_idx = 6;
EXPECT_EQ(type_list[first_overlay_idx], GopFrameType::kOverlay);
@@ -686,8 +693,8 @@ TEST(RefFrameManagerTest, GetRefFrameCount) {
// After the kOverlay, the kRegularArf should be moved from
// kForward to kBackward due to the cur_global_order_idx_ update
EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 0);
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 2);
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 3);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 3);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 2);
}
void TestRefFrameManagerPriority(const RefFrameManager &ref_manager,
@@ -723,7 +730,7 @@ TEST(RefFrameManagerTest, GetRefFrameByPriority) {
GopFrameType::kRegularLeaf,
GopFrameType::kOverlay
};
- RefFrameManager ref_manager(kRefFrameTableSize);
+ RefFrameManager ref_manager(kRefFrameTableSize, 7);
int coding_idx = 0;
const int first_leaf_idx = 3;
EXPECT_EQ(type_list[first_leaf_idx], GopFrameType::kRegularLeaf);
@@ -746,9 +753,9 @@ TEST(RefFrameManagerTest, GetRefFrameByPriority) {
ref_manager.UpdateRefFrameTable(&gop_frame);
}
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 2);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 3);
TestRefFrameManagerPriority(ref_manager, RefUpdateType::kBackward);
- EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 3);
+ EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 2);
TestRefFrameManagerPriority(ref_manager, RefUpdateType::kLast);
}
@@ -759,7 +766,7 @@ TEST(RefFrameManagerTest, GetRefFrameListByPriority) {
GopFrameType::kRegularArf,
GopFrameType::kIntermediateArf,
GopFrameType::kRegularLeaf };
- RefFrameManager ref_manager(kRefFrameTableSize);
+ RefFrameManager ref_manager(kRefFrameTableSize, 7);
for (int coding_idx = 0; coding_idx < frame_count; ++coding_idx) {
GopFrame gop_frame =
GopFrameBasic(0, 0, coding_idx, order_idx_list[coding_idx], 0, 0,
@@ -794,7 +801,7 @@ TEST(RefFrameManagerTest, GetPrimaryRefFrame) {
GopFrameType::kIntermediateArf,
GopFrameType::kRegularLeaf };
const std::vector<int> layer_depth_list = { 0, 2, 4, 6 };
- RefFrameManager ref_manager(kRefFrameTableSize);
+ RefFrameManager ref_manager(kRefFrameTableSize, 7);
for (int coding_idx = 0; coding_idx < frame_count; ++coding_idx) {
GopFrame gop_frame =
GopFrameBasic(0, 0, coding_idx, order_idx_list[coding_idx],
@@ -808,6 +815,7 @@ TEST(RefFrameManagerTest, GetPrimaryRefFrame) {
// Set different frame type
GopFrameType type = type_list[(i + 1) % frame_count];
GopFrame gop_frame = GopFrameBasic(0, 0, 0, 0, layer_depth, 0, type);
+ gop_frame.ref_frame_list = ref_manager.GetRefFrameListByPriority();
ReferenceFrame ref_frame = ref_manager.GetPrimaryRefFrame(gop_frame);
GopFrame primary_ref_frame =
ref_manager.GetRefFrameByIndex(ref_frame.index);
@@ -823,6 +831,7 @@ TEST(RefFrameManagerTest, GetPrimaryRefFrame) {
// Let the frame layer_depth sit in the middle of two reference frames
int layer_depth = mid_layer_depth_list[i];
GopFrame gop_frame = GopFrameBasic(0, 0, 0, 0, layer_depth, 0, type);
+ gop_frame.ref_frame_list = ref_manager.GetRefFrameListByPriority();
ReferenceFrame ref_frame = ref_manager.GetPrimaryRefFrame(gop_frame);
GopFrame primary_ref_frame =
ref_manager.GetRefFrameByIndex(ref_frame.index);
@@ -836,7 +845,7 @@ TEST_F(RateControlQModeTest, TestKeyframeDetection) {
FirstpassInfo firstpass_info;
const std::string kFirstpassStatsFile = "firstpass_stats";
ASSERT_NO_FATAL_FAILURE(
- ReadFirstpassInfo(kFirstpassStatsFile, &firstpass_info));
+ ReadFirstpassInfo(kFirstpassStatsFile, &firstpass_info, kFrameLimit));
EXPECT_THAT(GetKeyFrameList(firstpass_info),
ElementsAre(0, 30, 60, 90, 120, 150, 180, 210, 240));
}
@@ -918,13 +927,6 @@ TEST_F(RateControlQModeTest, TestInvalidMaxRefFrames) {
HasSubstr("max_ref_frames (8) must be in the range"));
}
-TEST_F(RateControlQModeTest, TestInvalidMaxDepth) {
- rc_param_.max_depth = 6;
- Status status = AV1RateControlQMode().SetRcParam(rc_param_);
- EXPECT_EQ(status.code, AOM_CODEC_INVALID_PARAM);
- EXPECT_THAT(status.message, HasSubstr("max_depth (6) must be in the range"));
-}
-
TEST_F(RateControlQModeTest, TestInvalidBaseQIndex) {
rc_param_.base_q_index = 256;
Status status = AV1RateControlQMode().SetRcParam(rc_param_);
@@ -963,7 +965,7 @@ TEST_F(RateControlQModeTest, TestGetRefFrameTableListFirstGop) {
// For the first GOP only, GetRefFrameTableList can be passed a
// default-constructed RefFrameTable (because it's all going to be
// replaced by the key frame anyway).
- rc.GetRefFrameTableList(gop_struct, RefFrameTable()),
+ rc.GetRefFrameTableList(gop_struct, {}, RefFrameTable()),
ElementsAre(
ElementsAre(matches_invalid, matches_invalid, matches_invalid),
ElementsAre(matches_frame0, matches_frame0, matches_frame0),
@@ -993,7 +995,7 @@ TEST_F(RateControlQModeTest, TestGetRefFrameTableListNotFirstGop) {
gop_struct.global_coding_idx_offset = 5; // This is not the first GOP.
gop_struct.gop_frame_list = { frame0, frame1, frame2 };
ASSERT_THAT(
- rc.GetRefFrameTableList(gop_struct, RefFrameTable(3, previous)),
+ rc.GetRefFrameTableList(gop_struct, {}, RefFrameTable(3, previous)),
ElementsAre(
ElementsAre(matches_previous, matches_previous, matches_previous),
ElementsAre(matches_previous, matches_previous, matches_frame0),
@@ -1004,7 +1006,7 @@ TEST_F(RateControlQModeTest, TestGetRefFrameTableListNotFirstGop) {
TEST_F(RateControlQModeTest, TestGopIntervals) {
FirstpassInfo firstpass_info;
ASSERT_NO_FATAL_FAILURE(
- ReadFirstpassInfo("firstpass_stats", &firstpass_info));
+ ReadFirstpassInfo("firstpass_stats", &firstpass_info, kFrameLimit));
AV1RateControlQMode rc;
ASSERT_THAT(rc.SetRcParam(rc_param_), IsOkStatus());
@@ -1018,12 +1020,15 @@ TEST_F(RateControlQModeTest, TestGopIntervals) {
ElementsAre(21, 9, 30, 30, 16, 14, 21, 9, 30, 12, 16, 2, 30, 10));
}
+// TODO(b/242892473): Add a test which passes lookahead GOPs.
TEST_F(RateControlQModeTest, TestGetGopEncodeInfo) {
FirstpassInfo firstpass_info;
ASSERT_NO_FATAL_FAILURE(
- ReadFirstpassInfo("firstpass_stats", &firstpass_info));
+ ReadFirstpassInfo("firstpass_stats", &firstpass_info, 50));
AV1RateControlQMode rc;
rc_param_.max_gop_show_frame_count = 16;
+ rc_param_.max_ref_frames = 3;
+ rc_param_.base_q_index = 117;
ASSERT_THAT(rc.SetRcParam(rc_param_), IsOkStatus());
const auto gop_info = rc.DetermineGopInfo(firstpass_info);
ASSERT_THAT(gop_info.status(), IsOkStatus());
@@ -1032,25 +1037,39 @@ TEST_F(RateControlQModeTest, TestGetGopEncodeInfo) {
const aom::VideoInfo input_video = {
kFrameWidth, kFrameHeight,
frame_rate, AOM_IMG_FMT_I420,
- 250, libaom_test::GetDataPath() + "/hantro_collage_w352h288.yuv"
+ 50, libaom_test::GetDataPath() + "/hantro_collage_w352h288.yuv"
};
- DuckyEncode ducky_encode(input_video, 3, 3);
- ducky_encode.StartEncode(firstpass_info.stats_list);
+ DuckyEncode ducky_encode(input_video, BLOCK_64X64, rc_param_.max_ref_frames,
+ 3, rc_param_.base_q_index);
+
+ std::vector<aom::GopEncodeInfo> gop_encode_info_list;
+ for (const auto &gop_struct : gop_list) {
+ const auto gop_encode_info =
+ rc.GetTplPassGopEncodeInfo(gop_struct, firstpass_info);
+ ASSERT_TRUE(gop_encode_info.ok());
+ gop_encode_info_list.push_back(gop_encode_info.value());
+ }
+
// Read TPL stats
- std::vector<TplGopStats> tpl_gop_list =
- ducky_encode.ComputeTplStats(gop_list);
- ducky_encode.EndEncode();
+ std::vector<TplGopStats> tpl_gop_list = ducky_encode.ComputeTplStats(
+ firstpass_info.stats_list, gop_list, gop_encode_info_list);
+
RefFrameTable ref_frame_table;
int num_gop_skipped = 0;
for (size_t gop_idx = 0; gop_idx < gop_list.size(); gop_idx++) {
size_t tpl_gop_idx = gop_idx - num_gop_skipped;
- const auto gop_encode_info = rc.GetGopEncodeInfo(
- gop_list[gop_idx], tpl_gop_list[tpl_gop_idx], ref_frame_table);
+ const auto gop_encode_info =
+ rc.GetGopEncodeInfo(gop_list[gop_idx], tpl_gop_list[tpl_gop_idx], {},
+ firstpass_info, ref_frame_table);
ASSERT_THAT(gop_encode_info.status(), IsOkStatus());
for (auto &frame_param : gop_encode_info->param_list) {
- std::cout << frame_param.q_index << std::endl;
+ EXPECT_LE(frame_param.q_index, rc_param_.base_q_index);
}
ref_frame_table = gop_encode_info->final_snapshot;
+ for (auto &gop_frame : ref_frame_table) {
+ EXPECT_LE(static_cast<int>(gop_frame.ref_frame_list.size()),
+ rc_param_.max_ref_frames);
+ }
}
}
@@ -1062,7 +1081,8 @@ TEST_F(RateControlQModeTest, GetGopEncodeInfoWrongGopSize) {
5, CreateToyTplFrameStatsWithDiffSizes(8, 8));
AV1RateControlQMode rc;
const Status status =
- rc.GetGopEncodeInfo(gop_struct, tpl_gop_stats, RefFrameTable()).status();
+ rc.GetGopEncodeInfo(gop_struct, tpl_gop_stats, {}, {}, RefFrameTable())
+ .status();
EXPECT_EQ(status.code, AOM_CODEC_INVALID_PARAM);
EXPECT_THAT(status.message,
HasSubstr("Frame count of GopStruct (7) doesn't match frame "
@@ -1081,12 +1101,13 @@ TEST_F(RateControlQModeTest, GetGopEncodeInfoRefFrameMissingBlockStats) {
// Only frame 0 has TPL block stats.
TplGopStats tpl_gop_stats;
- tpl_gop_stats.frame_stats_list.assign(3, { 8, 176, 144, {} });
+ tpl_gop_stats.frame_stats_list.assign(3, { 8, 176, 144, false, {}, {} });
tpl_gop_stats.frame_stats_list[0] = CreateToyTplFrameStatsWithDiffSizes(8, 8);
AV1RateControlQMode rc;
const Status status =
- rc.GetGopEncodeInfo(gop_struct, tpl_gop_stats, RefFrameTable()).status();
+ rc.GetGopEncodeInfo(gop_struct, tpl_gop_stats, {}, {}, RefFrameTable())
+ .status();
EXPECT_EQ(status.code, AOM_CODEC_INVALID_PARAM);
EXPECT_THAT(status.message,
HasSubstr("The frame with global_coding_idx 2 is a reference "
@@ -1109,6 +1130,47 @@ TEST_F(RateControlQModeTest, TestMock) {
EXPECT_EQ(result.status().message, "message");
}
+TEST_F(RateControlQModeTest, TestKMeans) {
+ // The distance between intended centroids is designed so each cluster is far
+ // enough from others.
+ std::vector<int> centroids_ref = { 16, 48, 80, 112, 144, 176, 208, 240 };
+ std::vector<uint8_t> random_input;
+ const int num_sample_per_cluster = 10;
+ const int num_clusters = 8;
+ std::default_random_engine generator;
+ for (const int centroid : centroids_ref) {
+ // This is to make sure each cluster is far enough from others.
+ std::uniform_int_distribution<int> distribution(centroid - 8, centroid + 8);
+ for (int i = 0; i < num_sample_per_cluster; ++i) {
+ const int random_sample = distribution(generator);
+ random_input.push_back(static_cast<uint8_t>(random_sample));
+ }
+ }
+ std::shuffle(random_input.begin(), random_input.end(), generator);
+ std::unordered_map<int, int> kmeans_result =
+ aom::internal::KMeans(random_input, num_clusters);
+
+ std::unordered_set<int> found_centroids;
+ for (const auto &result : kmeans_result) {
+ found_centroids.insert(result.second);
+ }
+ // Verify that there are num_clusters distinct centroids in the k-means result.
+ EXPECT_EQ(static_cast<int>(found_centroids.size()), num_clusters);
+
+ // Verify that for each data point, the assigned centroid is the closest one.
+ for (const auto &result : kmeans_result) {
+ const int distance_from_cluster_centroid =
+ abs(result.first - result.second);
+ for (const int centroid : found_centroids) {
+ if (centroid == result.second) continue;
+ const int distance_from_other_cluster_centroid =
+ abs(result.first - centroid);
+ EXPECT_LE(distance_from_cluster_centroid,
+ distance_from_other_cluster_centroid);
+ }
+ }
+}
+
} // namespace aom
int main(int argc, char **argv) {
diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc
index 13b444c46..7910b417b 100644
--- a/test/ratectrl_rtc_test.cc
+++ b/test/ratectrl_rtc_test.cc
@@ -18,11 +18,12 @@
#include "test/util.h"
#include "test/y4m_video_source.h"
#include "test/yuv_video_source.h"
+#include "test/i420_video_source.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
namespace {
-constexpr size_t kNumFrames = 250;
+constexpr size_t kNumFrames = 450;
constexpr int kTemporalId[4] = { 0, 2, 1, 2 };
@@ -46,6 +47,7 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
void PreEncodeFrameHook(libaom_test::VideoSource *video,
libaom_test::Encoder *encoder) override {
+ int key_int = key_interval_;
const int use_svc =
rc_cfg_.ss_number_layers > 1 || rc_cfg_.ts_number_layers > 1;
encoder->Control(AV1E_SET_RTC_EXTERNAL_RC, 1);
@@ -53,6 +55,8 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
encoder->Control(AOME_SET_CPUUSED, 7);
encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+ encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT,
+ rc_cfg_.max_intra_bitrate_pct);
if (use_svc) encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
}
// SVC specific settings
@@ -63,15 +67,10 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
layer_id_.spatial_layer_id = frame_params_.spatial_layer_id;
layer_id_.temporal_layer_id = frame_params_.temporal_layer_id;
encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
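+ // layer_frame_cnt_ counts per-layer frames, so scale the key frame interval
+ // by the number of spatial layers when deciding where key frames land.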
+ key_int = key_interval_ * rc_cfg_.ss_number_layers;
}
- frame_params_.frame_type = layer_frame_cnt_ % key_interval_ == 0
- ? aom::kKeyFrame
- : aom::kInterFrame;
- if (!use_svc && frame_params_.frame_type == aom::kInterFrame) {
- // Disable golden frame update.
- frame_flags_ |= AOM_EFLAG_NO_UPD_GF;
- frame_flags_ |= AOM_EFLAG_NO_UPD_ARF;
- }
+ frame_params_.frame_type =
+ layer_frame_cnt_ % key_int == 0 ? aom::kKeyFrame : aom::kInterFrame;
encoder_exit_ = video->frame() == kNumFrames;
}
@@ -99,33 +98,61 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
}
void RunOneLayer() {
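+ // Use a very large key frame interval so only the first frame is a key frame.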
+ key_interval_ = 10000;
SetConfig();
rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
frame_params_.spatial_layer_id = 0;
frame_params_.temporal_layer_id = 0;
- ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0,
- kNumFrames);
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunOneLayerPeriodicKey() {
+ key_interval_ = 100;
+ SetConfig();
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
void RunSvc() {
+ key_interval_ = 10000;
SetConfigSvc();
rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
frame_params_.spatial_layer_id = 0;
frame_params_.temporal_layer_id = 0;
- ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0,
- kNumFrames);
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcPeriodicKey() {
+ key_interval_ = 100;
+ SetConfigSvc();
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
private:
void SetConfig() {
- rc_cfg_.width = 1280;
- rc_cfg_.height = 720;
+ rc_cfg_.width = 640;
+ rc_cfg_.height = 480;
rc_cfg_.max_quantizer = 52;
rc_cfg_.min_quantizer = 2;
rc_cfg_.target_bandwidth = 1000;
@@ -146,8 +173,8 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
rc_cfg_.aq_mode = aq_mode_;
// Encoder settings for ground truth.
- cfg_.g_w = 1280;
- cfg_.g_h = 720;
+ cfg_.g_w = 640;
+ cfg_.g_h = 480;
cfg_.rc_undershoot_pct = 50;
cfg_.rc_overshoot_pct = 50;
cfg_.rc_buf_initial_sz = 600;
@@ -165,8 +192,8 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
}
void SetConfigSvc() {
- rc_cfg_.width = 1280;
- rc_cfg_.height = 720;
+ rc_cfg_.width = 640;
+ rc_cfg_.height = 480;
rc_cfg_.max_quantizer = 52;
rc_cfg_.min_quantizer = 2;
rc_cfg_.target_bandwidth = 1000;
@@ -211,6 +238,8 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
}
// Encoder settings for ground truth.
+ cfg_.g_w = 640;
+ cfg_.g_h = 480;
svc_params_.number_spatial_layers = 3;
svc_params_.number_temporal_layers = 3;
cfg_.g_timebase.num = 1;
@@ -269,8 +298,12 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
+TEST_P(RcInterfaceTest, OneLayerPeriodicKey) { RunOneLayerPeriodicKey(); }
+
TEST_P(RcInterfaceTest, Svc) { RunSvc(); }
+TEST_P(RcInterfaceTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
+
AV1_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3));
} // namespace
diff --git a/test/resize_test.cc b/test/resize_test.cc
index f91066b5f..d015ff027 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -868,4 +868,114 @@ AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest,
::testing::Values(::libaom_test::kRealTime));
+// A test that reproduces crbug.com/1393384. In realtime usage mode, encode
+// frames of sizes 202x202, 1x202, and 202x202. ASan should report no memory
+// errors.
+TEST(ResizeSimpleTest, TemporarySmallerFrameSize) {
+ constexpr int kWidth = 202;
+ constexpr int kHeight = 202;
+ // Dummy buffer of zero samples.
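+ // Sized for (at least) one I420 frame: kWidth * kHeight luma samples plus
+ // two half-resolution chroma planes.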
+ constexpr size_t kBufferSize =
+ kWidth * kHeight + 2 * (kWidth + 1) / 2 * (kHeight + 1) / 2;
+ std::vector<unsigned char> buffer(kBufferSize);
+
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+ aom_image_t img2;
+ EXPECT_EQ(&img2, aom_img_wrap(&img2, AOM_IMG_FMT_I420, 1, kHeight, 1,
+ buffer.data()));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ cfg.g_w = 1;
+ cfg.g_h = kHeight;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img2, 1, 1, 0));
+
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 2, 1, 0));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// A test that reproduces crbug.com/1410766. In realtime usage mode
+// for SVC with temporal layers, encode frames of sizes 600x600,
+// 600x600, and 100x480. ASan should report no memory errors.
+TEST(ResizeSimpleTest, SmallerFrameSizeSVC) {
+ constexpr int kWidth = 600;
+ constexpr int kHeight = 600;
+ // Dummy buffer of zero samples.
+ constexpr size_t kBufferSize =
+ kWidth * kHeight + 2 * (kWidth + 1) / 2 * (kHeight + 1) / 2;
+ std::vector<unsigned char> buffer(kBufferSize);
+
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+ aom_image_t img2;
+ EXPECT_EQ(&img2,
+ aom_img_wrap(&img2, AOM_IMG_FMT_I420, 100, 480, 1, buffer.data()));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+
+ aom_svc_params_t svc_params = {};
+ aom_svc_layer_id_t layer_id;
+ svc_params.number_spatial_layers = 1;
+ svc_params.framerate_factor[0] = 2;
+ svc_params.framerate_factor[1] = 1;
+ svc_params.number_temporal_layers = 2;
+ // Bitrate allocation L0: 60% L1: 40%
+ svc_params.layer_target_bitrate[0] = 60 * cfg.rc_target_bitrate / 100;
+ svc_params.layer_target_bitrate[1] = cfg.rc_target_bitrate;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params));
+
+ layer_id.spatial_layer_id = 0;
+ layer_id.temporal_layer_id = 0;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ layer_id.temporal_layer_id = 1;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 1, 1, 0));
+
+ cfg.g_w = 100;
+ cfg.g_h = 480;
+ layer_id.temporal_layer_id = 0;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img2, 2, 1, 0));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
} // namespace
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index 79a89fd36..e5cc1637c 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc
@@ -48,7 +48,7 @@ std::unordered_map<std::string,
{ "niklas_1280_720_30.y4m",
{ { 5, { { 0, 34.4 }, { 3, 34.30 } } },
{ 6, { { 0, 34.2 }, { 3, 34.2 } } },
- { 7, { { 0, 33.5 }, { 3, 33.5 } } },
+ { 7, { { 0, 33.5 }, { 3, 33.48 } } },
{ 8, { { 0, 33.48 }, { 3, 33.48 } } },
{ 9, { { 0, 33.4 }, { 3, 33.4 } } },
{ 10, { { 0, 33.2 }, { 3, 33.2 } } } } },
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 55897f215..98c8f51a1 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -337,6 +337,30 @@ class SADTestBase : public ::testing::Test {
}
}
+ virtual void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ (void)results;
+ (void)references;
+ }
+
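+ // SpeedSAD() below times repeated calls to SADForSpeedTest(); each derived
+ // test class overrides SADForSpeedTest() to dispatch to the function under
+ // test, while the base-class default above is a no-op.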
+ void SpeedSAD() {
+ int test_count = 20000000;
+ unsigned int exp_sad[4];
+ const uint8_t *references[] = { GetReference(0), GetReference(1),
+ GetReference(2), GetReference(3) };
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ while (test_count > 0) {
+ SADForSpeedTest(exp_sad, references);
+ test_count -= 1;
+ }
+ aom_usec_timer_mark(&timer);
+ const int64_t time = aom_usec_timer_elapsed(&timer) / 1000;
+ std::cout << "BLOCK_" << width_ << "X" << height_
+ << ", bit_depth: " << bit_depth_ << ", Time: " << time << "ms"
+ << std::endl;
+ }
+
int width_, height_, mask_, bd_;
aom_bit_depth_t bit_depth_;
static uint8_t *source_data_;
@@ -376,9 +400,14 @@ class SADx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
source_data_, source_stride_, references, reference_stride_, results));
}
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
+ }
+
void CheckSADs() {
unsigned int reference_sad, exp_sad[4];
-
SADs(exp_sad);
for (int block = 0; block < 4; ++block) {
reference_sad = ReferenceSAD(block);
@@ -386,13 +415,36 @@ class SADx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
}
}
+};
- void SpeedSAD() {
- int test_count = 2000000;
- unsigned int exp_sad[4];
- while (test_count > 0) {
- SADs(exp_sad);
- test_count -= 1;
+class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+ public SADTestBase {
+ public:
+ SADx3Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ void SADs(unsigned int *results) {
+ const uint8_t *references[] = { GetReference(0), GetReference(1),
+ GetReference(2), GetReference(3) };
+
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
+ }
+
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
+ }
+
+ void CheckSADs() {
+ unsigned int reference_sad, exp_sad[4];
+
+ SADs(exp_sad);
+ for (int block = 0; block < 3; ++block) {
+ reference_sad = ReferenceSAD(block);
+
+ EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
}
}
};
@@ -422,13 +474,10 @@ class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
}
}
- void SpeedSAD() {
- int test_count = 2000000;
- unsigned int exp_sad[4];
- while (test_count > 0) {
- SADs(exp_sad);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
}
};
@@ -459,13 +508,11 @@ class SADx4AvgTest : public ::testing::WithParamInterface<SadMxNx4AvgParam>,
}
}
- void SpeedSAD() {
- int test_count = 200000;
- unsigned int exp_sad[4];
- while (test_count > 0) {
- SADs(exp_sad);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, second_pred_,
+ results);
}
};
#endif // !CONFIG_REALTIME_ONLY
@@ -492,12 +539,11 @@ class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
ASSERT_EQ(reference_sad, exp_sad);
}
- void SpeedSAD() {
- int test_count = 20000000;
- while (test_count > 0) {
- SAD(0);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references[0], reference_stride_);
+ (void)results;
}
};
@@ -523,12 +569,11 @@ class SADSkipTest : public ::testing::WithParamInterface<SadMxNParam>,
ASSERT_EQ(reference_sad, exp_sad);
}
- void SpeedSAD() {
- int test_count = 20000000;
- while (test_count > 0) {
- SAD(0);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references[0], reference_stride_);
+ (void)results;
}
};
@@ -613,12 +658,12 @@ class DistWtdSADTest : public ::testing::WithParamInterface<DistWtdSadMxhParam>,
ASSERT_EQ(reference_sad, exp_sad);
}
- void SpeedSAD() {
- int test_count = 20000000;
- while (test_count > 0) {
- SAD(0);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references[0], reference_stride_, width_,
+ height_);
+ (void)results;
}
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdSADTest);
@@ -711,6 +756,7 @@ TEST_P(SADTest, ShortSrc) {
FillRandom(source_data_, source_stride_);
FillRandom(reference_data_, reference_stride_);
CheckSAD();
+ if (testing::Test::HasFatalFailure()) break;
test_count -= 1;
}
source_stride_ = tmp_stride;
@@ -765,6 +811,7 @@ TEST_P(SADSkipTest, ShortSrc) {
FillRandom(source_data_, source_stride_);
FillRandom(reference_data_, reference_stride_);
CheckSAD();
+ if (testing::Test::HasFatalFailure()) break;
test_count -= 1;
}
source_stride_ = tmp_stride;
@@ -823,6 +870,7 @@ TEST_P(SADavgTest, ShortSrc) {
FillRandom(reference_data_, reference_stride_);
FillRandom(second_pred_, width_);
CheckSAD();
+ if (testing::Test::HasFatalFailure()) break;
test_count -= 1;
}
source_stride_ = tmp_stride;
@@ -900,6 +948,7 @@ TEST_P(DistWtdSADTest, ShortSrc) {
FillRandom(source_data_, source_stride_);
FillRandom(reference_data_, reference_stride_);
CheckSAD();
+ if (testing::Test::HasFatalFailure()) break;
test_count -= 1;
}
source_stride_ = tmp_stride;
@@ -949,11 +998,13 @@ TEST_P(DistWtdSADavgTest, ShortSrc) {
FillRandom(reference_data_, reference_stride_);
FillRandom(second_pred_, width_);
CheckSAD();
+ if (testing::Test::HasFatalFailure()) break;
test_count -= 1;
}
source_stride_ = tmp_stride;
}
+// SADx4
TEST_P(SADx4Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(GetReference(0), reference_stride_, mask_);
@@ -1035,6 +1086,88 @@ TEST_P(SADx4Test, DISABLED_Speed) {
SpeedSAD();
}
+// SADx3
+TEST_P(SADx3Test, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(GetReference(0), reference_stride_, mask_);
+ FillConstant(GetReference(1), reference_stride_, mask_);
+ FillConstant(GetReference(2), reference_stride_, mask_);
+ FillConstant(GetReference(3), reference_stride_, mask_);
+ CheckSADs();
+}
+
+TEST_P(SADx3Test, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(GetReference(0), reference_stride_, 0);
+ FillConstant(GetReference(1), reference_stride_, 0);
+ FillConstant(GetReference(2), reference_stride_, 0);
+ FillConstant(GetReference(3), reference_stride_, 0);
+ CheckSADs();
+}
+
+TEST_P(SADx3Test, ShortRef) {
+ int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, ShortSrc) {
+ int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ int test_count = 1000;
+ while (test_count > 0) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ test_count -= 1;
+ }
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, SrcAlignedByWidth) {
+ uint8_t *tmp_source_data = source_data_;
+ source_data_ += width_;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ source_data_ = tmp_source_data;
+}
+
+TEST_P(SADx3Test, DISABLED_Speed) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ SpeedSAD();
+}
+
// SADSkipx4
TEST_P(SADSkipx4Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
@@ -1655,6 +1788,108 @@ const SadMxNx4Param x4d_c_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+const SadMxNx4Param x3d_c_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x3d_c, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_c, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_c, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_c, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_c, -1),
+ make_tuple(32, 64, &aom_sad32x64x3d_c, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_c, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_c, -1),
+ make_tuple(16, 32, &aom_sad16x32x3d_c, -1),
+ make_tuple(16, 16, &aom_sad16x16x3d_c, -1),
+ make_tuple(16, 8, &aom_sad16x8x3d_c, -1),
+ make_tuple(8, 16, &aom_sad8x16x3d_c, -1),
+ make_tuple(8, 8, &aom_sad8x8x3d_c, -1),
+ make_tuple(8, 4, &aom_sad8x4x3d_c, -1),
+ make_tuple(4, 8, &aom_sad4x8x3d_c, -1),
+ make_tuple(4, 4, &aom_sad4x4x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 12),
+#endif
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16x3d_c, -1),
+ make_tuple(16, 64, &aom_sad16x64x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 12),
+#endif
+ make_tuple(32, 8, &aom_sad32x8x3d_c, -1),
+ make_tuple(8, 32, &aom_sad8x32x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 12),
+#endif
+ make_tuple(16, 4, &aom_sad16x4x3d_c, -1),
+ make_tuple(4, 16, &aom_sad4x16x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 12),
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(C, SADx3Test, ::testing::ValuesIn(x3d_c_tests));
+
const SadMxNx4Param skip_x4d_c_tests[] = {
make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
@@ -1790,20 +2025,57 @@ INSTANTIATE_TEST_SUITE_P(C, SADx4AvgTest, ::testing::ValuesIn(x4d_avg_c_tests));
#if HAVE_NEON
const SadMxNParam neon_tests[] = {
make_tuple(128, 128, &aom_sad128x128_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128_neon, -1),
make_tuple(64, 64, &aom_sad64x64_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64_neon, -1),
make_tuple(32, 32, &aom_sad32x32_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32_neon, -1),
make_tuple(16, 16, &aom_sad16x16_neon, -1),
make_tuple(16, 8, &aom_sad16x8_neon, -1),
make_tuple(8, 16, &aom_sad8x16_neon, -1),
make_tuple(8, 8, &aom_sad8x8_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8_neon, -1),
make_tuple(4, 4, &aom_sad4x4_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16_neon, -1),
+#endif
};
INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
const SadMxNx4Param x4d_neon_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x4d_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_neon, -1),
make_tuple(64, 64, &aom_sad64x64x4d_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_neon, -1),
make_tuple(32, 32, &aom_sad32x32x4d_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_neon, -1),
make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16x4d_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8x4d_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64x4d_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32x4d_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16x4d_neon, -1),
+#endif
};
INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
const SadSkipMxNParam skip_neon_tests[] = {
@@ -1857,6 +2129,35 @@ const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
::testing::ValuesIn(skip_x4d_neon_tests));
+
+const SadMxNAvgParam avg_neon_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_avg_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_neon, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_neon, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_neon, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16_avg_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8_avg_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4_avg_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8_avg_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4_avg_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16_avg_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8_avg_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64_avg_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4_avg_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32_avg_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16_avg_neon, -1),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
+
#endif // HAVE_NEON
//------------------------------------------------------------------------------
@@ -2637,6 +2938,9 @@ const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_avx2, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_avx2, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_avx2, -1),
#if CONFIG_AV1_HIGHBITDEPTH
make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
@@ -2693,6 +2997,8 @@ const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
#if !CONFIG_REALTIME_ONLY
make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
+
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_avx2, -1),
#endif
};
@@ -2700,6 +3006,9 @@ INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,
::testing::ValuesIn(skip_x4d_avx2_tests));
const SadMxNx4Param x4d_avx2_tests[] = {
+ make_tuple(16, 32, &aom_sad16x32x4d_avx2, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_avx2, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_avx2, -1),
make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
@@ -2710,6 +3019,8 @@ const SadMxNx4Param x4d_avx2_tests[] = {
make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
#if !CONFIG_REALTIME_ONLY
+ make_tuple(16, 64, &aom_sad16x64x4d_avx2, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_avx2, -1),
make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
#endif
@@ -2766,61 +3077,74 @@ const SadMxNx4Param x4d_avx2_tests[] = {
#endif
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
-#endif // HAVE_AVX2
-//------------------------------------------------------------------------------
-// MIPS functions
-#if HAVE_MSA
-const SadMxNParam msa_tests[] = {
- make_tuple(64, 64, &aom_sad64x64_msa, -1),
- make_tuple(64, 32, &aom_sad64x32_msa, -1),
- make_tuple(32, 64, &aom_sad32x64_msa, -1),
- make_tuple(32, 32, &aom_sad32x32_msa, -1),
- make_tuple(32, 16, &aom_sad32x16_msa, -1),
- make_tuple(16, 32, &aom_sad16x32_msa, -1),
- make_tuple(16, 16, &aom_sad16x16_msa, -1),
- make_tuple(16, 8, &aom_sad16x8_msa, -1),
- make_tuple(8, 16, &aom_sad8x16_msa, -1),
- make_tuple(8, 8, &aom_sad8x8_msa, -1),
- make_tuple(8, 4, &aom_sad8x4_msa, -1),
- make_tuple(4, 8, &aom_sad4x8_msa, -1),
- make_tuple(4, 4, &aom_sad4x4_msa, -1),
-};
-INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
-
-const SadMxNAvgParam avg_msa_tests[] = {
- make_tuple(64, 64, &aom_sad64x64_avg_msa, -1),
- make_tuple(64, 32, &aom_sad64x32_avg_msa, -1),
- make_tuple(32, 64, &aom_sad32x64_avg_msa, -1),
- make_tuple(32, 32, &aom_sad32x32_avg_msa, -1),
- make_tuple(32, 16, &aom_sad32x16_avg_msa, -1),
- make_tuple(16, 32, &aom_sad16x32_avg_msa, -1),
- make_tuple(16, 16, &aom_sad16x16_avg_msa, -1),
- make_tuple(16, 8, &aom_sad16x8_avg_msa, -1),
- make_tuple(8, 16, &aom_sad8x16_avg_msa, -1),
- make_tuple(8, 8, &aom_sad8x8_avg_msa, -1),
- make_tuple(8, 4, &aom_sad8x4_avg_msa, -1),
- make_tuple(4, 8, &aom_sad4x8_avg_msa, -1),
- make_tuple(4, 4, &aom_sad4x4_avg_msa, -1),
-};
-INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
-
-const SadMxNx4Param x4d_msa_tests[] = {
- make_tuple(64, 64, &aom_sad64x64x4d_msa, -1),
- make_tuple(64, 32, &aom_sad64x32x4d_msa, -1),
- make_tuple(32, 64, &aom_sad32x64x4d_msa, -1),
- make_tuple(32, 32, &aom_sad32x32x4d_msa, -1),
- make_tuple(32, 16, &aom_sad32x16x4d_msa, -1),
- make_tuple(16, 32, &aom_sad16x32x4d_msa, -1),
- make_tuple(16, 16, &aom_sad16x16x4d_msa, -1),
- make_tuple(16, 8, &aom_sad16x8x4d_msa, -1),
- make_tuple(8, 16, &aom_sad8x16x4d_msa, -1),
- make_tuple(8, 8, &aom_sad8x8x4d_msa, -1),
- make_tuple(8, 4, &aom_sad8x4x4d_msa, -1),
- make_tuple(4, 8, &aom_sad4x8x4d_msa, -1),
- make_tuple(4, 4, &aom_sad4x4x4d_msa, -1),
+const SadMxNx4Param x3d_avx2_tests[] = {
+ make_tuple(32, 64, &aom_sad32x64x3d_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_avx2, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128x3d_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_avx2, -1),
+
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(32, 8, &aom_sad32x8x3d_avx2, -1),
+ make_tuple(64, 16, &aom_sad64x16x3d_avx2, -1),
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 12),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
};
-INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
-#endif // HAVE_MSA
+INSTANTIATE_TEST_SUITE_P(AVX2, SADx3Test, ::testing::ValuesIn(x3d_avx2_tests));
+#endif // HAVE_AVX2
} // namespace
diff --git a/test/sb_qp_sweep_test.cc b/test/sb_qp_sweep_test.cc
new file mode 100644
index 000000000..6c76a40b2
--- /dev/null
+++ b/test/sb_qp_sweep_test.cc
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Parameters: cpu-used, row-mt.
+class AV1SBQPSweepTest : public ::libaom_test::CodecTestWith2Params<int, bool>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1SBQPSweepTest()
+ : EncoderTest(GET_PARAM(0)), set_cpu_used_(GET_PARAM(1)),
+ row_mt_(GET_PARAM(2)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 1280;
+ cfg.h = 720;
+ cfg.allow_lowbitdepth = 1;
+ decoder_ =
+ std::unique_ptr<::libaom_test::Decoder>(codec_->CreateDecoder(cfg, 0));
+ }
+ ~AV1SBQPSweepTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(::libaom_test::kTwoPassGood);
+
+ ASSERT_NE(decoder_, nullptr);
+ if (decoder_->IsAV1()) {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_Q;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ SetTileSize(encoder);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_ENABLE_SB_QP_SWEEP, use_sb_sweep_);
+ encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ virtual void SetTileSize(libaom_test::Encoder *encoder) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+ encoder->Control(AV1E_SET_TILE_ROWS, 1);
+ }
+
+ void BeginPassHook(unsigned int) override {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetAverageFrameSize() const {
+ if (nframes_) return static_cast<double>(sum_frame_size_) / nframes_;
+ return 0.0;
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ sum_frame_size_ += pkt->data.frame.sz;
+
+ const aom_codec_err_t res = decoder_->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ }
+
+ void DoTest() {
+ ::libaom_test::YUVVideoSource video(
+ "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 0, 6);
+ cfg_.rc_target_bitrate = 1000;
+
+ // Encode without sb_qp_sweep
+ use_sb_sweep_ = false;
+ sum_frame_size_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_1 = GetAveragePsnr();
+ const size_t avg_frame_size_1 = sum_frame_size_ / nframes_;
+
+ // Encode with sb_qp_sweep
+ use_sb_sweep_ = true;
+ sum_frame_size_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_2 = GetAveragePsnr();
+ const size_t avg_frame_size_2 = sum_frame_size_ / nframes_;
+
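+ // Sanity check: enabling the sweep may lower PSNR only if it does not
+ // increase the average frame size, and may increase the average frame
+ // size only if it does not lower PSNR.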
+ if (psnr_1 >= psnr_2) {
+ ASSERT_GE(avg_frame_size_1, avg_frame_size_2);
+ }
+ if (avg_frame_size_1 <= avg_frame_size_2) {
+ ASSERT_LE(psnr_1, psnr_2);
+ }
+ }
+
+ bool use_sb_sweep_;
+ int set_cpu_used_;
+ bool row_mt_;
+ double psnr_;
+ unsigned int nframes_;
+ size_t sum_frame_size_;
+ std::unique_ptr<::libaom_test::Decoder> decoder_;
+};
+
+TEST_P(AV1SBQPSweepTest, SweepMatchTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(AV1SBQPSweepTest, ::testing::Range(4, 6),
+ ::testing::Bool());
+
+} // namespace
diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc
index e7c32e627..68355ece5 100644
--- a/test/sse_sum_test.cc
+++ b/test/sse_sum_test.cc
@@ -160,6 +160,13 @@ INSTANTIATE_TEST_SUITE_P(SSE2, SumSSETest,
&aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_sse2)));
#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SumSSETest,
+ ::testing::Values(TestFuncs(
+ &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_neon)));
+#endif // HAVE_NEON
+
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, SumSSETest,
::testing::Values(TestFuncs(
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 59dd21831..4003e51dc 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -31,124 +31,65 @@ typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
namespace {
-class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
- public:
- virtual void TearDown() {}
-};
+using std::get;
+using std::make_tuple;
+using std::tuple;
using libaom_test::ACMRandom;
-TEST_P(AV1SubtractBlockTest, SimpleSubtract) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
-
- // FIXME(rbultje) split in its own file
- for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
- bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
- const int block_width = block_size_wide[bsize];
- const int block_height = block_size_high[bsize];
- int16_t *diff = reinterpret_cast<int16_t *>(
- aom_memalign(16, sizeof(*diff) * block_width * block_height * 2));
- ASSERT_NE(diff, nullptr);
- uint8_t *pred = reinterpret_cast<uint8_t *>(
- aom_memalign(16, block_width * block_height * 2));
- ASSERT_NE(pred, nullptr);
- uint8_t *src = reinterpret_cast<uint8_t *>(
- aom_memalign(16, block_width * block_height * 2));
- ASSERT_NE(src, nullptr);
-
- for (int n = 0; n < 100; n++) {
- for (int r = 0; r < block_height; ++r) {
- for (int c = 0; c < block_width * 2; ++c) {
- src[r * block_width * 2 + c] = rnd.Rand8();
- pred[r * block_width * 2 + c] = rnd.Rand8();
- }
- }
-
- GetParam()(block_height, block_width, diff, block_width, src, block_width,
- pred, block_width);
-
- for (int r = 0; r < block_height; ++r) {
- for (int c = 0; c < block_width; ++c) {
- EXPECT_EQ(diff[r * block_width + c],
- (src[r * block_width + c] - pred[r * block_width + c]))
- << "r = " << r << ", c = " << c << ", bs = " << bsize;
- }
- }
+// <BLOCK_SIZE, optimized subtract func, reference subtract func>
+using Params = tuple<BLOCK_SIZE, SubtractFunc, SubtractFunc>;
- GetParam()(block_height, block_width, diff, block_width * 2, src,
- block_width * 2, pred, block_width * 2);
-
- for (int r = 0; r < block_height; ++r) {
- for (int c = 0; c < block_width; ++c) {
- EXPECT_EQ(
- diff[r * block_width * 2 + c],
- (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c]))
- << "r = " << r << ", c = " << c << ", bs = " << bsize;
- }
- }
+class AV1SubtractBlockTestBase : public ::testing::Test {
+ public:
+ AV1SubtractBlockTestBase(BLOCK_SIZE bs, int bit_depth, SubtractFunc func,
+ SubtractFunc ref_func) {
+ block_width_ = block_size_wide[bs];
+ block_height_ = block_size_high[bs];
+ func_ = func;
+ ref_func_ = ref_func;
+ if (bit_depth == -1) {
+ hbd_ = 0;
+ bit_depth_ = AOM_BITS_8;
+ } else {
+ hbd_ = 1;
+ bit_depth_ = static_cast<aom_bit_depth_t>(bit_depth);
}
- aom_free(diff);
- aom_free(pred);
- aom_free(src);
}
-}
-
-INSTANTIATE_TEST_SUITE_P(C, AV1SubtractBlockTest,
- ::testing::Values(aom_subtract_block_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1SubtractBlockTest,
- ::testing::Values(aom_subtract_block_sse2));
-#endif
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1SubtractBlockTest,
- ::testing::Values(aom_subtract_block_neon));
-#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_SUITE_P(MSA, AV1SubtractBlockTest,
- ::testing::Values(aom_subtract_block_msa));
-#endif
-
-#if CONFIG_AV1_HIGHBITDEPTH
-typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
- ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr,
- ptrdiff_t pred_stride);
-using std::get;
-using std::make_tuple;
-using std::tuple;
-
-// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
-typedef tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc> Params;
-
-class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
- public:
virtual void SetUp() {
- block_width_ = block_size_wide[GET_PARAM(0)];
- block_height_ = block_size_high[GET_PARAM(0)];
- bit_depth_ = static_cast<aom_bit_depth_t>(GET_PARAM(1));
- func_ = GET_PARAM(2);
- ref_func_ = GET_PARAM(3);
-
rnd_.Reset(ACMRandom::DeterministicSeed());
const size_t max_width = 128;
const size_t max_block_size = max_width * max_width;
- src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- aom_memalign(16, max_block_size * sizeof(uint16_t))));
- ASSERT_NE(src_, nullptr);
- pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- aom_memalign(16, max_block_size * sizeof(uint16_t))));
- ASSERT_NE(pred_, nullptr);
+ if (hbd_) {
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ ASSERT_NE(src_, nullptr);
+ pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ ASSERT_NE(pred_, nullptr);
+ } else {
+ src_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint8_t)));
+ ASSERT_NE(src_, nullptr);
+ pred_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint8_t)));
+ ASSERT_NE(pred_, nullptr);
+ }
diff_ = reinterpret_cast<int16_t *>(
- aom_memalign(16, max_block_size * sizeof(int16_t)));
+ aom_memalign(32, max_block_size * sizeof(int16_t)));
ASSERT_NE(diff_, nullptr);
}
virtual void TearDown() {
- aom_free(CONVERT_TO_SHORTPTR(src_));
- aom_free(CONVERT_TO_SHORTPTR(pred_));
+ if (hbd_) {
+ aom_free(CONVERT_TO_SHORTPTR(src_));
+ aom_free(CONVERT_TO_SHORTPTR(pred_));
+ } else {
+ aom_free(src_);
+ aom_free(pred_);
+ }
aom_free(diff_);
}
@@ -157,60 +98,78 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
void RunForSpeed();
private:
+ void FillInputs();
+
ACMRandom rnd_;
int block_height_;
int block_width_;
+ bool hbd_;
aom_bit_depth_t bit_depth_;
- HBDSubtractFunc func_;
- HBDSubtractFunc ref_func_;
+ SubtractFunc func_;
+ SubtractFunc ref_func_;
uint8_t *src_;
uint8_t *pred_;
int16_t *diff_;
};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HBDSubtractBlockTest);
-void AV1HBDSubtractBlockTest::CheckResult() {
- const int test_num = 100;
+void AV1SubtractBlockTestBase::FillInputs() {
const size_t max_width = 128;
const int max_block_size = max_width * max_width;
- const int mask = (1 << bit_depth_) - 1;
- int i, j;
+ if (hbd_) {
+ const int mask = (1 << bit_depth_) - 1;
+ for (int i = 0; i < max_block_size; ++i) {
+ CONVERT_TO_SHORTPTR(src_)[i] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[i] = rnd_.Rand16() & mask;
+ }
+ } else {
+ if (src_ == nullptr) {
+ std::cerr << "gadfg" << std::endl;
+ }
+ for (int i = 0; i < max_block_size; ++i) {
+ src_[i] = rnd_.Rand8();
+ pred_[i] = rnd_.Rand8();
+ }
+ }
+}
+
+void AV1SubtractBlockTestBase::CheckResult() {
+ const int test_num = 100;
+ int i;
for (i = 0; i < test_num; ++i) {
- for (j = 0; j < max_block_size; ++j) {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
- CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
- }
+ FillInputs();
func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
pred_, block_width_);
- for (int r = 0; r < block_height_; ++r) {
- for (int c = 0; c < block_width_; ++c) {
- EXPECT_EQ(diff_[r * block_width_ + c],
- (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
- CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
- << "r = " << r << ", c = " << c << ", test: " << i;
+ if (hbd_)
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+ CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ else {
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ src_[r * block_width_ + c] - pred_[r * block_width_ + c])
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
}
}
}
}
-TEST_P(AV1HBDSubtractBlockTest, CheckResult) { CheckResult(); }
-
-void AV1HBDSubtractBlockTest::RunForSpeed() {
+void AV1SubtractBlockTestBase::RunForSpeed() {
const int test_num = 200000;
- const size_t max_width = 128;
- const int max_block_size = max_width * max_width;
- const int mask = (1 << bit_depth_) - 1;
- int i, j;
+ int i;
if (ref_func_ == func_) GTEST_SKIP();
- for (j = 0; j < max_block_size; ++j) {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
- CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
- }
+ FillInputs();
aom_usec_timer ref_timer;
aom_usec_timer_start(&ref_timer);
@@ -221,10 +180,7 @@ void AV1HBDSubtractBlockTest::RunForSpeed() {
aom_usec_timer_mark(&ref_timer);
const int64_t ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
- for (j = 0; j < max_block_size; ++j) {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
- CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
- }
+ FillInputs();
aom_usec_timer timer;
aom_usec_timer_start(&timer);
@@ -245,7 +201,17 @@ void AV1HBDSubtractBlockTest::RunForSpeed() {
static_cast<double>(elapsed_time));
}
-TEST_P(AV1HBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+class AV1SubtractBlockTest : public ::testing::WithParamInterface<Params>,
+ public AV1SubtractBlockTestBase {
+ public:
+ AV1SubtractBlockTest()
+ : AV1SubtractBlockTestBase(GET_PARAM(0), -1, GET_PARAM(1), GET_PARAM(2)) {
+ }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1SubtractBlockTest);
+
+TEST_P(AV1SubtractBlockTest, CheckResult) { CheckResult(); }
+TEST_P(AV1SubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
@@ -255,6 +221,50 @@ const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
BLOCK_128X128 };
INSTANTIATE_TEST_SUITE_P(
+ C, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_c),
+ ::testing::Values(&aom_subtract_block_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_sse2),
+ ::testing::Values(&aom_subtract_block_c)));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_avx2),
+ ::testing::Values(&aom_subtract_block_c)));
+
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_neon),
+ ::testing::Values(&aom_subtract_block_c)));
+
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
+using ParamsHBD = tuple<BLOCK_SIZE, int, SubtractFunc, SubtractFunc>;
+
+class AV1HBDSubtractBlockTest : public ::testing::WithParamInterface<ParamsHBD>,
+ public AV1SubtractBlockTestBase {
+ public:
+ AV1HBDSubtractBlockTest()
+ : AV1SubtractBlockTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2),
+ GET_PARAM(3)) {}
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HBDSubtractBlockTest);
+
+INSTANTIATE_TEST_SUITE_P(
C, AV1HBDSubtractBlockTest,
::testing::Combine(::testing::ValuesIn(kValidBlockSize),
::testing::Values(12),
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 770777c6a..a5c38401b 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -90,6 +90,8 @@ class DatarateTestSVC
frame_sync_ = 0;
current_video_frame_ = 0;
screen_mode_ = 0;
+ rps_mode_ = 0;
+ rps_recovery_frame_ = 0;
}
virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
@@ -122,16 +124,17 @@ class DatarateTestSVC
if (number_spatial_layers_ == 2) {
spatial_layer_id = (layer_frame_cnt_ % 2 == 0) ? 0 : 1;
} else if (number_spatial_layers_ == 3) {
- spatial_layer_id = (layer_frame_cnt_ % 3 == 0)
- ? 0
- : ((layer_frame_cnt_ - 1) % 3 == 0) ? 1 : 2;
+ spatial_layer_id = (layer_frame_cnt_ % 3 == 0) ? 0
+ : ((layer_frame_cnt_ - 1) % 3 == 0) ? 1
+ : 2;
}
// Set the reference/update flags, layer_id, and reference_map
// buffer index.
frame_flags_ = set_layer_pattern(
video->frame(), &layer_id_, &ref_frame_config_, &ref_frame_comp_pred_,
spatial_layer_id, multi_ref_, comp_pred_,
- (video->frame() % cfg_.kf_max_dist) == 0, dynamic_enable_disable_mode_);
+ (video->frame() % cfg_.kf_max_dist) == 0, dynamic_enable_disable_mode_,
+ rps_mode_, rps_recovery_frame_);
if (intra_only_ == 1 && frame_sync_ > 0) {
// Set an Intra-only frame on SL0 at frame_sync_.
// In order to allow decoding to start on SL0 in mid-sequence we need to
@@ -213,7 +216,7 @@ class DatarateTestSVC
}
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
duration_ = ((last_pts_ + 1) * timebase_);
for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
effective_datarate_tl[i] = (effective_datarate_tl[i] / 1000) / duration_;
@@ -249,13 +252,66 @@ class DatarateTestSVC
unsigned int GetMismatchFrames() { return mismatch_nframes_; }
unsigned int GetDecodedFrames() { return decoded_nframes_; }
+ static void ref_config_rps(aom_svc_ref_frame_config_t *ref_frame_config,
+ int frame_cnt, int rps_recovery_frame) {
+ // Pattern of 3 references, with ALTREF and GOLDEN trailing
+ // LAST by 4 and 8 frames respectively, with some switching logic to
+ // only predict from a longer-term reference.
+ int last_idx = 0;
+ int last_idx_refresh = 0;
+ int gld_idx = 0;
+ int alt_ref_idx = 0;
+ const int lag_alt = 4;
+ const int lag_gld = 8;
+ const int sh = 8; // slots 0 - 7.
+ // Moving index slot for last: 0 - (sh - 1)
+ if (frame_cnt > 1) last_idx = (frame_cnt - 1) % sh;
+ // Moving index for refresh of last: one ahead for next frame.
+ last_idx_refresh = frame_cnt % sh;
+ // Moving index for gld_ref, lags behind current by lag_gld frames.
+ if (frame_cnt > lag_gld) gld_idx = (frame_cnt - lag_gld) % sh;
+ // Moving index for alt_ref, lags behind LAST by lag_alt frames.
+ if (frame_cnt > lag_alt) alt_ref_idx = (frame_cnt - lag_alt) % sh;
+ // Set the ref_idx.
+ // Default all references (7) to slot for last.
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = last_idx;
+ // Set the ref_idx for the relevant references.
+ ref_frame_config->ref_idx[0] = last_idx;
+ ref_frame_config->ref_idx[1] = last_idx_refresh;
+ ref_frame_config->ref_idx[3] = gld_idx;
+ ref_frame_config->ref_idx[6] = alt_ref_idx;
+ // Refresh this slot, which will become LAST on next frame.
+ ref_frame_config->refresh[last_idx_refresh] = 1;
+ // Reference LAST, ALTREF, and GOLDEN
+ ref_frame_config->reference[0] = 1;
+ ref_frame_config->reference[6] = 1;
+ ref_frame_config->reference[3] = 1;
+ if (frame_cnt == rps_recovery_frame) {
+ // Switch to only reference GOLDEN at recovery_frame.
+ ref_frame_config->reference[0] = 0;
+ ref_frame_config->reference[6] = 0;
+ ref_frame_config->reference[3] = 1;
+ } else if (frame_cnt > rps_recovery_frame &&
+ frame_cnt < rps_recovery_frame + 8) {
+ // Go back to predicting from LAST, and after
+ // 8 frames (GOLDEN is 8 frames away) go back
+ // to predicting from GOLDEN and ALTREF.
+ ref_frame_config->reference[0] = 1;
+ ref_frame_config->reference[6] = 0;
+ ref_frame_config->reference[3] = 0;
+ }
+ }
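+
+ // Example for ref_config_rps() above (illustration only, not used by the
+ // test): with sh = 8, lag_alt = 4 and lag_gld = 8, at frame_cnt = 10 the
+ // slots resolve to last_idx = (10 - 1) % 8 = 1, last_idx_refresh =
+ // 10 % 8 = 2, gld_idx = (10 - 8) % 8 = 2 and alt_ref_idx = (10 - 4) % 8 = 6,
+ // so LAST points at the slot refreshed after frame 9, GOLDEN at the slot
+ // holding frame 2 (8 frames back), and ALTREF at the slot holding frame 6
+ // (4 frames back).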
+
// Layer pattern configuration.
virtual int set_layer_pattern(
int frame_cnt, aom_svc_layer_id_t *layer_id,
aom_svc_ref_frame_config_t *ref_frame_config,
aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int spatial_layer,
int multi_ref, int comp_pred, int is_key_frame,
- int dynamic_enable_disable_mode) {
+ int dynamic_enable_disable_mode, int rps_mode, int rps_recovery_frame) {
int lag_index = 0;
int base_count = frame_cnt >> 2;
layer_id->spatial_layer_id = spatial_layer;
@@ -278,6 +334,8 @@ class DatarateTestSVC
ref_frame_config->reference[0] = 1;
if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) {
ref_frame_config->refresh[0] = 1;
+ if (rps_mode)
+ ref_config_rps(ref_frame_config, frame_cnt, rps_recovery_frame);
}
if (number_temporal_layers_ == 3 && number_spatial_layers_ == 1) {
// 3-layer:
@@ -612,8 +670,43 @@ class DatarateTestSVC
// mismatch count, since loopfilter/cdef is not applied for these on
// encoder side, but is always applied on decoder.
// This means 30 = #frames(60) - #TL2_frames(30).
- EXPECT_EQ((int)GetMismatchFrames(), 30);
+ // We use LE for screen since the loopfilter level can become very small
+ // or zero, in which case the frame is not counted as a mismatch.
+ EXPECT_LE((int)GetMismatchFrames(), 30);
}
+
+ virtual void BasicRateTargetingSVC1TL3SLScreenTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+
+ const int bitrate_array[2] = { 800, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ screen_mode_ = 1;
+ number_temporal_layers_ = 1;
+ number_spatial_layers_ = 3;
+ target_layer_bitrate_[0] = 30 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[1] = 60 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.50)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.5)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
virtual void BasicRateTargetingSVC1TL1SLScreenScCutsMotionTest() {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
@@ -810,7 +903,7 @@ class DatarateTestSVC
target_layer_bitrate_[8] = bitrate_sl2;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
- ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.585)
<< " The datarate for the file is lower than target by too much!";
ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
<< " The datarate for the file is greater than target by too much!";
@@ -1694,6 +1787,51 @@ class DatarateTestSVC
}
}
+ virtual void BasicRateTargetingRPS1TL1SLDropFramesTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 100, 300 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ rps_mode_ = 1;
+ rps_recovery_frame_ = 100;
+ cfg_.g_error_resilient = 0;
+ // Drop x frames before the recovery frame, where the reference
+ // is switched to an older reference (GOLDEN or ALTREF).
+ // GOLDEN is 8 frames behind (for this RPS pattern), so we can't
+ // drop more than 8 frames before the recovery frame; choose x = 7.
+ int n = 0;
+ for (int i = rps_recovery_frame_ - 7; i < rps_recovery_frame_; i++) {
+ drop_frames_list_[n] = i;
+ n++;
+ }
+ drop_frames_ = n;
+ number_spatial_layers_ = 1;
+ number_temporal_layers_ = 1;
+ target_layer_bitrate_[0] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Test that no mismatches have been found.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
int layer_frame_cnt_;
int superframe_cnt_;
int number_temporal_layers_;
@@ -1721,6 +1859,8 @@ class DatarateTestSVC
unsigned int frame_sync_;
unsigned int current_video_frame_;
int screen_mode_;
+ int rps_mode_;
+ int rps_recovery_frame_;
};
// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial.
@@ -1734,6 +1874,12 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLScreen) {
BasicRateTargetingSVC3TL1SLScreenTest();
}
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal
+// for screen mode.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLScreen) {
+ BasicRateTargetingSVC1TL3SLScreenTest();
+}
+
// Check basic rate targeting for CBR, for 1 temporal layer, 1 spatial
// for screen mode, with source with many scene cuts and motion.
TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL1SLScreenScCutsMotion) {
@@ -1912,6 +2058,14 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLDynDisEnabl) {
BasicRateTargetingSVC1TL3SLDynDisEnablTest();
}
+// Check basic rate targeting and encoder/decoder mismatch, for RPS
+// with 1 layer. A number of consecutive frames are lost midway through the
+// sequence, and the encoder resorts to a longer-term reference to recover
+// and continue decoding successfully.
+TEST_P(DatarateTestSVC, BasicRateTargetingRPS1TL1SLDropFrames) {
+ BasicRateTargetingRPS1TL1SLDropFramesTest();
+}
+
AV1_INSTANTIATE_TEST_SUITE(DatarateTestSVC,
::testing::Values(::libaom_test::kRealTime),
::testing::Range(7, 11), ::testing::Values(0, 3),
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index bf61f02cf..154fd5dfd 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -296,6 +296,15 @@ INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
Combine(ValuesIn(temporal_filter_test_sse2),
Range(64, 65, 4)));
#endif // HAVE_SSE2
+
+#if HAVE_NEON
+TemporalFilterFuncParam temporal_filter_test_neon[] = { TemporalFilterFuncParam(
+ &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
+INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_neon),
+ Range(64, 65, 4)));
+#endif // HAVE_NEON
+
#if CONFIG_AV1_HIGHBITDEPTH
typedef void (*HBDTemporalFilterFunc)(
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 16673ab42..7e4abaee5 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -22,7 +22,7 @@ b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10555.ivf.res
cf5945085fe85456a1f74bf4cc7998b88b3f4b62 *invalid-oss-fuzz-10705.ivf
758671858368ffd2a2c0727898de5661f7cf7d68 *invalid-oss-fuzz-10705.ivf.res
88e29851122cca3f336824f7fa4d9f757f91110c *invalid-oss-fuzz-10723.ivf
-64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-10723.ivf.res.2
+1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-10723.ivf.res.3
0784acc8931090ec24eba752d6c27e359e68fe7d *invalid-oss-fuzz-10779.ivf
5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-10779.ivf.res
7d37be9357f89a100ced694aee1ca5a6fad35ba9 *invalid-oss-fuzz-11477.ivf
@@ -566,3 +566,6 @@ e71cd9a07f928c527c900daddd071ae60337426d *av1-1-b8-24-monochrome.ivf.md5
e24aa6951afd7b2bb53eb1a73e25a19e7b189f82 *av1-1-b10-24-monochrome.ivf.md5
df0c9481104aa8c81f9e3b61b6d147a331ad3e35 *firstpass_stats
3eaf216d9fc8b4b9bb8c3956311f49a85974806c *bus_352x288_420_f20_b8.yuv
+c7f336958e7af6162c20ddc84d67c7dfa9826910 *av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf
+36a4fcf07e645ed522cde5845dd9c6ab2b2d1502 *av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf.md5
+9f935d391fdf4a6f7c320355d45770d2e7d6095c *desktopqvga2.320_240.yuv
diff --git a/test/test.cmake b/test/test.cmake
index 9ad832e49..a173246b4 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -53,17 +53,21 @@ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/active_map_test.cc"
"${AOM_ROOT}/test/aq_segment_test.cc"
"${AOM_ROOT}/test/av1_external_partition_test.cc"
+ "${AOM_ROOT}/test/avif_progressive_test.cc"
"${AOM_ROOT}/test/borders_test.cc"
"${AOM_ROOT}/test/cpu_speed_test.cc"
"${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
"${AOM_ROOT}/test/datarate_test.cc"
"${AOM_ROOT}/test/datarate_test.h"
+ "${AOM_ROOT}/test/deltaq_mode_test.cc"
"${AOM_ROOT}/test/svc_datarate_test.cc"
"${AOM_ROOT}/test/encode_api_test.cc"
"${AOM_ROOT}/test/encode_small_width_height_test.cc"
"${AOM_ROOT}/test/encode_test_driver.cc"
"${AOM_ROOT}/test/encode_test_driver.h"
"${AOM_ROOT}/test/end_to_end_psnr_test.cc"
+ "${AOM_ROOT}/test/forced_max_frame_width_height_test.cc"
+ "${AOM_ROOT}/test/force_key_frame_test.cc"
"${AOM_ROOT}/test/gf_pyr_height_test.cc"
"${AOM_ROOT}/test/rt_end_to_end_test.cc"
"${AOM_ROOT}/test/allintra_end_to_end_test.cc"
@@ -74,6 +78,7 @@ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/level_test.cc"
"${AOM_ROOT}/test/metadata_test.cc"
"${AOM_ROOT}/test/monochrome_test.cc"
+ "${AOM_ROOT}/test/postproc_filters_test.cc"
"${AOM_ROOT}/test/resize_test.cc"
"${AOM_ROOT}/test/scalability_test.cc"
"${AOM_ROOT}/test/sharpness_test.cc"
@@ -104,15 +109,19 @@ if(CONFIG_REALTIME_ONLY)
list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/allintra_end_to_end_test.cc"
"${AOM_ROOT}/test/av1_external_partition_test.cc"
+ "${AOM_ROOT}/test/avif_progressive_test.cc"
"${AOM_ROOT}/test/borders_test.cc"
"${AOM_ROOT}/test/cpu_speed_test.cc"
"${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
+ "${AOM_ROOT}/test/deltaq_mode_test.cc"
"${AOM_ROOT}/test/end_to_end_psnr_test.cc"
+ "${AOM_ROOT}/test/force_key_frame_test.cc"
"${AOM_ROOT}/test/gf_pyr_height_test.cc"
"${AOM_ROOT}/test/horz_superres_test.cc"
"${AOM_ROOT}/test/level_test.cc"
"${AOM_ROOT}/test/metadata_test.cc"
"${AOM_ROOT}/test/monochrome_test.cc"
+ "${AOM_ROOT}/test/postproc_filters_test.cc"
"${AOM_ROOT}/test/sharpness_test.cc")
endif()
@@ -199,9 +208,13 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/av1_k_means_test.cc")
list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
- "${AOM_ROOT}/test/corner_match_test.cc"
"${AOM_ROOT}/test/simd_cmp_sse4.cc")
+ if(NOT CONFIG_REALTIME_ONLY)
+ list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/test/corner_match_test.cc")
+ endif()
+
if(CONFIG_ACCOUNTING)
list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
"${AOM_ROOT}/test/accounting_test.cc")
@@ -228,6 +241,7 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/ratectrl_test.cc"
"${AOM_ROOT}/test/rd_test.cc"
"${AOM_ROOT}/test/sb_multipass_test.cc"
+ "${AOM_ROOT}/test/sb_qp_sweep_test.cc"
"${AOM_ROOT}/test/screen_content_test.cc"
"${AOM_ROOT}/test/segment_binarization_sync.cc"
"${AOM_ROOT}/test/still_picture_test.cc"
@@ -250,6 +264,7 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/kf_test.cc"
"${AOM_ROOT}/test/lossless_test.cc"
"${AOM_ROOT}/test/sb_multipass_test.cc"
+ "${AOM_ROOT}/test/sb_qp_sweep_test.cc"
"${AOM_ROOT}/test/selfguided_filter_test.cc"
"${AOM_ROOT}/test/screen_content_test.cc"
"${AOM_ROOT}/test/still_picture_test.cc"
@@ -319,7 +334,7 @@ if(NOT BUILD_SHARED_LIBS)
endif()
- if(HAVE_SSE4_2)
+ if(HAVE_SSE4_2 OR HAVE_ARM_CRC32)
list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc")
endif()
@@ -495,6 +510,10 @@ function(setup_aom_test_targets)
add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
"AOM_UNIT_TEST_COMMON_INTRIN_NEON")
endif()
+ if(HAVE_ARM_CRC32)
+ add_intrinsics_source_to_target("${AOM_ARM_CRC32_FLAG}" "test_libaom"
+ "AOM_UNIT_TEST_COMMON_INTRIN_CRC32")
+ endif()
if(ENABLE_TESTDATA)
make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
diff --git a/test/test_data_util.cmake b/test/test_data_util.cmake
index 74154b6ec..9a6accb30 100644
--- a/test/test_data_util.cmake
+++ b/test/test_data_util.cmake
@@ -10,6 +10,7 @@
#
list(APPEND AOM_TEST_DATA_FILE_NAMES
+ "desktopqvga2.320_240.yuv"
"desktop1.320_180.yuv"
"hantro_collage_w176h144.yuv"
"hantro_collage_w352h288.yuv"
@@ -518,6 +519,8 @@ if(CONFIG_AV1_DECODER)
"av1-1-b8-05-mv.ivf.md5"
"av1-1-b8-06-mfmv.ivf"
"av1-1-b8-06-mfmv.ivf.md5"
+ "av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf"
+ "av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf.md5"
"av1-1-b8-22-svc-L2T1.ivf"
"av1-1-b8-22-svc-L2T1.ivf.md5"
"av1-1-b8-22-svc-L1T2.ivf"
@@ -549,7 +552,7 @@ if(CONFIG_AV1_DECODER)
"invalid-oss-fuzz-10705.ivf"
"invalid-oss-fuzz-10705.ivf.res"
"invalid-oss-fuzz-10723.ivf"
- "invalid-oss-fuzz-10723.ivf.res.2"
+ "invalid-oss-fuzz-10723.ivf.res.3"
"invalid-oss-fuzz-10779.ivf"
"invalid-oss-fuzz-10779.ivf.res"
"invalid-oss-fuzz-11477.ivf"
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index ea5733040..bf90d4adb 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -461,12 +461,6 @@ INTRA_PRED_TEST(SSSE3, TX_4X16, nullptr, nullptr, nullptr, nullptr, nullptr,
aom_smooth_h_predictor_4x16_ssse3)
#endif // HAVE_SSSE3
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TX_4X4, aom_dc_predictor_4x4_dspr2, nullptr, nullptr,
- nullptr, nullptr, aom_h_predictor_4x4_dspr2, nullptr, nullptr,
- nullptr, nullptr)
-#endif // HAVE_DSPR2
-
#if HAVE_NEON
INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon,
aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
@@ -485,13 +479,6 @@ INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, nullptr, nullptr,
aom_smooth_h_predictor_4x16_neon)
#endif // HAVE_NEON
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_4X4, aom_dc_predictor_4x4_msa,
- aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa,
- aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa,
- aom_h_predictor_4x4_msa, nullptr, nullptr, nullptr, nullptr)
-#endif // HAVE_MSA
-
// -----------------------------------------------------------------------------
// 8x8, 8x4, 8x16, 8x32
@@ -561,12 +548,6 @@ INTRA_PRED_TEST(SSSE3, TX_8X32, nullptr, nullptr, nullptr, nullptr, nullptr,
aom_smooth_h_predictor_8x32_ssse3)
#endif // HAVE_SSSE3
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TX_8X8, aom_dc_predictor_8x8_dspr2, nullptr, nullptr,
- nullptr, nullptr, aom_h_predictor_8x8_dspr2, nullptr, nullptr,
- nullptr, nullptr)
-#endif // HAVE_DSPR2
-
#if HAVE_NEON
INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon,
aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
@@ -590,13 +571,6 @@ INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, nullptr, nullptr,
aom_smooth_h_predictor_8x32_neon)
#endif // HAVE_NEON
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_8X8, aom_dc_predictor_8x8_msa,
- aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa,
- aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa,
- aom_h_predictor_8x8_msa, nullptr, nullptr, nullptr, nullptr)
-#endif // HAVE_MSA
-
// -----------------------------------------------------------------------------
// 16x16, 16x8, 16x32, 16x4, 16x64
@@ -700,12 +674,6 @@ INTRA_PRED_TEST(AVX2, TX_16X64, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr)
#endif // HAVE_AVX2
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TX_16X16, aom_dc_predictor_16x16_dspr2, nullptr, nullptr,
- nullptr, nullptr, aom_h_predictor_16x16_dspr2, nullptr, nullptr,
- nullptr, nullptr)
-#endif // HAVE_DSPR2
-
#if HAVE_NEON
INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon,
aom_dc_left_predictor_16x16_neon,
@@ -737,13 +705,6 @@ INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, nullptr, nullptr,
aom_smooth_h_predictor_16x64_neon)
#endif // HAVE_NEON
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_16X16, aom_dc_predictor_16x16_msa,
- aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa,
- aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa,
- aom_h_predictor_16x16_msa, nullptr, nullptr, nullptr, nullptr)
-#endif // HAVE_MSA
-
// -----------------------------------------------------------------------------
// 32x32, 32x16, 32x64, 32x8
@@ -864,13 +825,6 @@ INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr, nullptr, nullptr,
aom_smooth_h_predictor_32x8_neon)
#endif // HAVE_NEON
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_32X32, aom_dc_predictor_32x32_msa,
- aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa,
- aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa,
- aom_h_predictor_32x32_msa, nullptr, nullptr, nullptr, nullptr)
-#endif // HAVE_MSA
-
// -----------------------------------------------------------------------------
// 64x64, 64x32, 64x16
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index c38461e05..09736d1ed 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -16,249 +16,252 @@ namespace libaom_test {
#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
#if CONFIG_AV1_DECODER
-const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf",
- "av1-1-b8-00-quantizer-01.ivf",
- "av1-1-b8-00-quantizer-02.ivf",
- "av1-1-b8-00-quantizer-03.ivf",
- "av1-1-b8-00-quantizer-04.ivf",
- "av1-1-b8-00-quantizer-05.ivf",
- "av1-1-b8-00-quantizer-06.ivf",
- "av1-1-b8-00-quantizer-07.ivf",
- "av1-1-b8-00-quantizer-08.ivf",
- "av1-1-b8-00-quantizer-09.ivf",
- "av1-1-b8-00-quantizer-10.ivf",
- "av1-1-b8-00-quantizer-11.ivf",
- "av1-1-b8-00-quantizer-12.ivf",
- "av1-1-b8-00-quantizer-13.ivf",
- "av1-1-b8-00-quantizer-14.ivf",
- "av1-1-b8-00-quantizer-15.ivf",
- "av1-1-b8-00-quantizer-16.ivf",
- "av1-1-b8-00-quantizer-17.ivf",
- "av1-1-b8-00-quantizer-18.ivf",
- "av1-1-b8-00-quantizer-19.ivf",
- "av1-1-b8-00-quantizer-20.ivf",
- "av1-1-b8-00-quantizer-21.ivf",
- "av1-1-b8-00-quantizer-22.ivf",
- "av1-1-b8-00-quantizer-23.ivf",
- "av1-1-b8-00-quantizer-24.ivf",
- "av1-1-b8-00-quantizer-25.ivf",
- "av1-1-b8-00-quantizer-26.ivf",
- "av1-1-b8-00-quantizer-27.ivf",
- "av1-1-b8-00-quantizer-28.ivf",
- "av1-1-b8-00-quantizer-29.ivf",
- "av1-1-b8-00-quantizer-30.ivf",
- "av1-1-b8-00-quantizer-31.ivf",
- "av1-1-b8-00-quantizer-32.ivf",
- "av1-1-b8-00-quantizer-33.ivf",
- "av1-1-b8-00-quantizer-34.ivf",
- "av1-1-b8-00-quantizer-35.ivf",
- "av1-1-b8-00-quantizer-36.ivf",
- "av1-1-b8-00-quantizer-37.ivf",
- "av1-1-b8-00-quantizer-38.ivf",
- "av1-1-b8-00-quantizer-39.ivf",
- "av1-1-b8-00-quantizer-40.ivf",
- "av1-1-b8-00-quantizer-41.ivf",
- "av1-1-b8-00-quantizer-42.ivf",
- "av1-1-b8-00-quantizer-43.ivf",
- "av1-1-b8-00-quantizer-44.ivf",
- "av1-1-b8-00-quantizer-45.ivf",
- "av1-1-b8-00-quantizer-46.ivf",
- "av1-1-b8-00-quantizer-47.ivf",
- "av1-1-b8-00-quantizer-48.ivf",
- "av1-1-b8-00-quantizer-49.ivf",
- "av1-1-b8-00-quantizer-50.ivf",
- "av1-1-b8-00-quantizer-51.ivf",
- "av1-1-b8-00-quantizer-52.ivf",
- "av1-1-b8-00-quantizer-53.ivf",
- "av1-1-b8-00-quantizer-54.ivf",
- "av1-1-b8-00-quantizer-55.ivf",
- "av1-1-b8-00-quantizer-56.ivf",
- "av1-1-b8-00-quantizer-57.ivf",
- "av1-1-b8-00-quantizer-58.ivf",
- "av1-1-b8-00-quantizer-59.ivf",
- "av1-1-b8-00-quantizer-60.ivf",
- "av1-1-b8-00-quantizer-61.ivf",
- "av1-1-b8-00-quantizer-62.ivf",
- "av1-1-b8-00-quantizer-63.ivf",
+const char *const kAV1TestVectors[] = {
+ "av1-1-b8-00-quantizer-00.ivf",
+ "av1-1-b8-00-quantizer-01.ivf",
+ "av1-1-b8-00-quantizer-02.ivf",
+ "av1-1-b8-00-quantizer-03.ivf",
+ "av1-1-b8-00-quantizer-04.ivf",
+ "av1-1-b8-00-quantizer-05.ivf",
+ "av1-1-b8-00-quantizer-06.ivf",
+ "av1-1-b8-00-quantizer-07.ivf",
+ "av1-1-b8-00-quantizer-08.ivf",
+ "av1-1-b8-00-quantizer-09.ivf",
+ "av1-1-b8-00-quantizer-10.ivf",
+ "av1-1-b8-00-quantizer-11.ivf",
+ "av1-1-b8-00-quantizer-12.ivf",
+ "av1-1-b8-00-quantizer-13.ivf",
+ "av1-1-b8-00-quantizer-14.ivf",
+ "av1-1-b8-00-quantizer-15.ivf",
+ "av1-1-b8-00-quantizer-16.ivf",
+ "av1-1-b8-00-quantizer-17.ivf",
+ "av1-1-b8-00-quantizer-18.ivf",
+ "av1-1-b8-00-quantizer-19.ivf",
+ "av1-1-b8-00-quantizer-20.ivf",
+ "av1-1-b8-00-quantizer-21.ivf",
+ "av1-1-b8-00-quantizer-22.ivf",
+ "av1-1-b8-00-quantizer-23.ivf",
+ "av1-1-b8-00-quantizer-24.ivf",
+ "av1-1-b8-00-quantizer-25.ivf",
+ "av1-1-b8-00-quantizer-26.ivf",
+ "av1-1-b8-00-quantizer-27.ivf",
+ "av1-1-b8-00-quantizer-28.ivf",
+ "av1-1-b8-00-quantizer-29.ivf",
+ "av1-1-b8-00-quantizer-30.ivf",
+ "av1-1-b8-00-quantizer-31.ivf",
+ "av1-1-b8-00-quantizer-32.ivf",
+ "av1-1-b8-00-quantizer-33.ivf",
+ "av1-1-b8-00-quantizer-34.ivf",
+ "av1-1-b8-00-quantizer-35.ivf",
+ "av1-1-b8-00-quantizer-36.ivf",
+ "av1-1-b8-00-quantizer-37.ivf",
+ "av1-1-b8-00-quantizer-38.ivf",
+ "av1-1-b8-00-quantizer-39.ivf",
+ "av1-1-b8-00-quantizer-40.ivf",
+ "av1-1-b8-00-quantizer-41.ivf",
+ "av1-1-b8-00-quantizer-42.ivf",
+ "av1-1-b8-00-quantizer-43.ivf",
+ "av1-1-b8-00-quantizer-44.ivf",
+ "av1-1-b8-00-quantizer-45.ivf",
+ "av1-1-b8-00-quantizer-46.ivf",
+ "av1-1-b8-00-quantizer-47.ivf",
+ "av1-1-b8-00-quantizer-48.ivf",
+ "av1-1-b8-00-quantizer-49.ivf",
+ "av1-1-b8-00-quantizer-50.ivf",
+ "av1-1-b8-00-quantizer-51.ivf",
+ "av1-1-b8-00-quantizer-52.ivf",
+ "av1-1-b8-00-quantizer-53.ivf",
+ "av1-1-b8-00-quantizer-54.ivf",
+ "av1-1-b8-00-quantizer-55.ivf",
+ "av1-1-b8-00-quantizer-56.ivf",
+ "av1-1-b8-00-quantizer-57.ivf",
+ "av1-1-b8-00-quantizer-58.ivf",
+ "av1-1-b8-00-quantizer-59.ivf",
+ "av1-1-b8-00-quantizer-60.ivf",
+ "av1-1-b8-00-quantizer-61.ivf",
+ "av1-1-b8-00-quantizer-62.ivf",
+ "av1-1-b8-00-quantizer-63.ivf",
#if CONFIG_AV1_HIGHBITDEPTH
- "av1-1-b10-00-quantizer-00.ivf",
- "av1-1-b10-00-quantizer-01.ivf",
- "av1-1-b10-00-quantizer-02.ivf",
- "av1-1-b10-00-quantizer-03.ivf",
- "av1-1-b10-00-quantizer-04.ivf",
- "av1-1-b10-00-quantizer-05.ivf",
- "av1-1-b10-00-quantizer-06.ivf",
- "av1-1-b10-00-quantizer-07.ivf",
- "av1-1-b10-00-quantizer-08.ivf",
- "av1-1-b10-00-quantizer-09.ivf",
- "av1-1-b10-00-quantizer-10.ivf",
- "av1-1-b10-00-quantizer-11.ivf",
- "av1-1-b10-00-quantizer-12.ivf",
- "av1-1-b10-00-quantizer-13.ivf",
- "av1-1-b10-00-quantizer-14.ivf",
- "av1-1-b10-00-quantizer-15.ivf",
- "av1-1-b10-00-quantizer-16.ivf",
- "av1-1-b10-00-quantizer-17.ivf",
- "av1-1-b10-00-quantizer-18.ivf",
- "av1-1-b10-00-quantizer-19.ivf",
- "av1-1-b10-00-quantizer-20.ivf",
- "av1-1-b10-00-quantizer-21.ivf",
- "av1-1-b10-00-quantizer-22.ivf",
- "av1-1-b10-00-quantizer-23.ivf",
- "av1-1-b10-00-quantizer-24.ivf",
- "av1-1-b10-00-quantizer-25.ivf",
- "av1-1-b10-00-quantizer-26.ivf",
- "av1-1-b10-00-quantizer-27.ivf",
- "av1-1-b10-00-quantizer-28.ivf",
- "av1-1-b10-00-quantizer-29.ivf",
- "av1-1-b10-00-quantizer-30.ivf",
- "av1-1-b10-00-quantizer-31.ivf",
- "av1-1-b10-00-quantizer-32.ivf",
- "av1-1-b10-00-quantizer-33.ivf",
- "av1-1-b10-00-quantizer-34.ivf",
- "av1-1-b10-00-quantizer-35.ivf",
- "av1-1-b10-00-quantizer-36.ivf",
- "av1-1-b10-00-quantizer-37.ivf",
- "av1-1-b10-00-quantizer-38.ivf",
- "av1-1-b10-00-quantizer-39.ivf",
- "av1-1-b10-00-quantizer-40.ivf",
- "av1-1-b10-00-quantizer-41.ivf",
- "av1-1-b10-00-quantizer-42.ivf",
- "av1-1-b10-00-quantizer-43.ivf",
- "av1-1-b10-00-quantizer-44.ivf",
- "av1-1-b10-00-quantizer-45.ivf",
- "av1-1-b10-00-quantizer-46.ivf",
- "av1-1-b10-00-quantizer-47.ivf",
- "av1-1-b10-00-quantizer-48.ivf",
- "av1-1-b10-00-quantizer-49.ivf",
- "av1-1-b10-00-quantizer-50.ivf",
- "av1-1-b10-00-quantizer-51.ivf",
- "av1-1-b10-00-quantizer-52.ivf",
- "av1-1-b10-00-quantizer-53.ivf",
- "av1-1-b10-00-quantizer-54.ivf",
- "av1-1-b10-00-quantizer-55.ivf",
- "av1-1-b10-00-quantizer-56.ivf",
- "av1-1-b10-00-quantizer-57.ivf",
- "av1-1-b10-00-quantizer-58.ivf",
- "av1-1-b10-00-quantizer-59.ivf",
- "av1-1-b10-00-quantizer-60.ivf",
- "av1-1-b10-00-quantizer-61.ivf",
- "av1-1-b10-00-quantizer-62.ivf",
- "av1-1-b10-00-quantizer-63.ivf",
- "av1-1-b10-23-film_grain-50.ivf",
- "av1-1-b10-24-monochrome.ivf",
+ "av1-1-b10-00-quantizer-00.ivf",
+ "av1-1-b10-00-quantizer-01.ivf",
+ "av1-1-b10-00-quantizer-02.ivf",
+ "av1-1-b10-00-quantizer-03.ivf",
+ "av1-1-b10-00-quantizer-04.ivf",
+ "av1-1-b10-00-quantizer-05.ivf",
+ "av1-1-b10-00-quantizer-06.ivf",
+ "av1-1-b10-00-quantizer-07.ivf",
+ "av1-1-b10-00-quantizer-08.ivf",
+ "av1-1-b10-00-quantizer-09.ivf",
+ "av1-1-b10-00-quantizer-10.ivf",
+ "av1-1-b10-00-quantizer-11.ivf",
+ "av1-1-b10-00-quantizer-12.ivf",
+ "av1-1-b10-00-quantizer-13.ivf",
+ "av1-1-b10-00-quantizer-14.ivf",
+ "av1-1-b10-00-quantizer-15.ivf",
+ "av1-1-b10-00-quantizer-16.ivf",
+ "av1-1-b10-00-quantizer-17.ivf",
+ "av1-1-b10-00-quantizer-18.ivf",
+ "av1-1-b10-00-quantizer-19.ivf",
+ "av1-1-b10-00-quantizer-20.ivf",
+ "av1-1-b10-00-quantizer-21.ivf",
+ "av1-1-b10-00-quantizer-22.ivf",
+ "av1-1-b10-00-quantizer-23.ivf",
+ "av1-1-b10-00-quantizer-24.ivf",
+ "av1-1-b10-00-quantizer-25.ivf",
+ "av1-1-b10-00-quantizer-26.ivf",
+ "av1-1-b10-00-quantizer-27.ivf",
+ "av1-1-b10-00-quantizer-28.ivf",
+ "av1-1-b10-00-quantizer-29.ivf",
+ "av1-1-b10-00-quantizer-30.ivf",
+ "av1-1-b10-00-quantizer-31.ivf",
+ "av1-1-b10-00-quantizer-32.ivf",
+ "av1-1-b10-00-quantizer-33.ivf",
+ "av1-1-b10-00-quantizer-34.ivf",
+ "av1-1-b10-00-quantizer-35.ivf",
+ "av1-1-b10-00-quantizer-36.ivf",
+ "av1-1-b10-00-quantizer-37.ivf",
+ "av1-1-b10-00-quantizer-38.ivf",
+ "av1-1-b10-00-quantizer-39.ivf",
+ "av1-1-b10-00-quantizer-40.ivf",
+ "av1-1-b10-00-quantizer-41.ivf",
+ "av1-1-b10-00-quantizer-42.ivf",
+ "av1-1-b10-00-quantizer-43.ivf",
+ "av1-1-b10-00-quantizer-44.ivf",
+ "av1-1-b10-00-quantizer-45.ivf",
+ "av1-1-b10-00-quantizer-46.ivf",
+ "av1-1-b10-00-quantizer-47.ivf",
+ "av1-1-b10-00-quantizer-48.ivf",
+ "av1-1-b10-00-quantizer-49.ivf",
+ "av1-1-b10-00-quantizer-50.ivf",
+ "av1-1-b10-00-quantizer-51.ivf",
+ "av1-1-b10-00-quantizer-52.ivf",
+ "av1-1-b10-00-quantizer-53.ivf",
+ "av1-1-b10-00-quantizer-54.ivf",
+ "av1-1-b10-00-quantizer-55.ivf",
+ "av1-1-b10-00-quantizer-56.ivf",
+ "av1-1-b10-00-quantizer-57.ivf",
+ "av1-1-b10-00-quantizer-58.ivf",
+ "av1-1-b10-00-quantizer-59.ivf",
+ "av1-1-b10-00-quantizer-60.ivf",
+ "av1-1-b10-00-quantizer-61.ivf",
+ "av1-1-b10-00-quantizer-62.ivf",
+ "av1-1-b10-00-quantizer-63.ivf",
+ "av1-1-b10-23-film_grain-50.ivf",
+ "av1-1-b10-24-monochrome.ivf",
#endif // CONFIG_AV1_HIGHBITDEPTH
- "av1-1-b8-01-size-16x16.ivf",
- "av1-1-b8-01-size-16x18.ivf",
- "av1-1-b8-01-size-16x32.ivf",
- "av1-1-b8-01-size-16x34.ivf",
- "av1-1-b8-01-size-16x64.ivf",
- "av1-1-b8-01-size-16x66.ivf",
- "av1-1-b8-01-size-18x16.ivf",
- "av1-1-b8-01-size-18x18.ivf",
- "av1-1-b8-01-size-18x32.ivf",
- "av1-1-b8-01-size-18x34.ivf",
- "av1-1-b8-01-size-18x64.ivf",
- "av1-1-b8-01-size-18x66.ivf",
- "av1-1-b8-01-size-196x196.ivf",
- "av1-1-b8-01-size-196x198.ivf",
- "av1-1-b8-01-size-196x200.ivf",
- "av1-1-b8-01-size-196x202.ivf",
- "av1-1-b8-01-size-196x208.ivf",
- "av1-1-b8-01-size-196x210.ivf",
- "av1-1-b8-01-size-196x224.ivf",
- "av1-1-b8-01-size-196x226.ivf",
- "av1-1-b8-01-size-198x196.ivf",
- "av1-1-b8-01-size-198x198.ivf",
- "av1-1-b8-01-size-198x200.ivf",
- "av1-1-b8-01-size-198x202.ivf",
- "av1-1-b8-01-size-198x208.ivf",
- "av1-1-b8-01-size-198x210.ivf",
- "av1-1-b8-01-size-198x224.ivf",
- "av1-1-b8-01-size-198x226.ivf",
- "av1-1-b8-01-size-200x196.ivf",
- "av1-1-b8-01-size-200x198.ivf",
- "av1-1-b8-01-size-200x200.ivf",
- "av1-1-b8-01-size-200x202.ivf",
- "av1-1-b8-01-size-200x208.ivf",
- "av1-1-b8-01-size-200x210.ivf",
- "av1-1-b8-01-size-200x224.ivf",
- "av1-1-b8-01-size-200x226.ivf",
- "av1-1-b8-01-size-202x196.ivf",
- "av1-1-b8-01-size-202x198.ivf",
- "av1-1-b8-01-size-202x200.ivf",
- "av1-1-b8-01-size-202x202.ivf",
- "av1-1-b8-01-size-202x208.ivf",
- "av1-1-b8-01-size-202x210.ivf",
- "av1-1-b8-01-size-202x224.ivf",
- "av1-1-b8-01-size-202x226.ivf",
- "av1-1-b8-01-size-208x196.ivf",
- "av1-1-b8-01-size-208x198.ivf",
- "av1-1-b8-01-size-208x200.ivf",
- "av1-1-b8-01-size-208x202.ivf",
- "av1-1-b8-01-size-208x208.ivf",
- "av1-1-b8-01-size-208x210.ivf",
- "av1-1-b8-01-size-208x224.ivf",
- "av1-1-b8-01-size-208x226.ivf",
- "av1-1-b8-01-size-210x196.ivf",
- "av1-1-b8-01-size-210x198.ivf",
- "av1-1-b8-01-size-210x200.ivf",
- "av1-1-b8-01-size-210x202.ivf",
- "av1-1-b8-01-size-210x208.ivf",
- "av1-1-b8-01-size-210x210.ivf",
- "av1-1-b8-01-size-210x224.ivf",
- "av1-1-b8-01-size-210x226.ivf",
- "av1-1-b8-01-size-224x196.ivf",
- "av1-1-b8-01-size-224x198.ivf",
- "av1-1-b8-01-size-224x200.ivf",
- "av1-1-b8-01-size-224x202.ivf",
- "av1-1-b8-01-size-224x208.ivf",
- "av1-1-b8-01-size-224x210.ivf",
- "av1-1-b8-01-size-224x224.ivf",
- "av1-1-b8-01-size-224x226.ivf",
- "av1-1-b8-01-size-226x196.ivf",
- "av1-1-b8-01-size-226x198.ivf",
- "av1-1-b8-01-size-226x200.ivf",
- "av1-1-b8-01-size-226x202.ivf",
- "av1-1-b8-01-size-226x208.ivf",
- "av1-1-b8-01-size-226x210.ivf",
- "av1-1-b8-01-size-226x224.ivf",
- "av1-1-b8-01-size-226x226.ivf",
- "av1-1-b8-01-size-32x16.ivf",
- "av1-1-b8-01-size-32x18.ivf",
- "av1-1-b8-01-size-32x32.ivf",
- "av1-1-b8-01-size-32x34.ivf",
- "av1-1-b8-01-size-32x64.ivf",
- "av1-1-b8-01-size-32x66.ivf",
- "av1-1-b8-01-size-34x16.ivf",
- "av1-1-b8-01-size-34x18.ivf",
- "av1-1-b8-01-size-34x32.ivf",
- "av1-1-b8-01-size-34x34.ivf",
- "av1-1-b8-01-size-34x64.ivf",
- "av1-1-b8-01-size-34x66.ivf",
- "av1-1-b8-01-size-64x16.ivf",
- "av1-1-b8-01-size-64x18.ivf",
- "av1-1-b8-01-size-64x32.ivf",
- "av1-1-b8-01-size-64x34.ivf",
- "av1-1-b8-01-size-64x64.ivf",
- "av1-1-b8-01-size-64x66.ivf",
- "av1-1-b8-01-size-66x16.ivf",
- "av1-1-b8-01-size-66x18.ivf",
- "av1-1-b8-01-size-66x32.ivf",
- "av1-1-b8-01-size-66x34.ivf",
- "av1-1-b8-01-size-66x64.ivf",
- "av1-1-b8-01-size-66x66.ivf",
- "av1-1-b8-02-allintra.ivf",
- "av1-1-b8-03-sizedown.mkv",
- "av1-1-b8-03-sizeup.mkv",
- "av1-1-b8-04-cdfupdate.ivf",
- "av1-1-b8-05-mv.ivf",
- "av1-1-b8-06-mfmv.ivf",
- "av1-1-b8-22-svc-L1T2.ivf",
- "av1-1-b8-22-svc-L2T1.ivf",
- "av1-1-b8-22-svc-L2T2.ivf",
- "av1-1-b8-23-film_grain-50.ivf",
- "av1-1-b8-24-monochrome.ivf" };
+ "av1-1-b8-01-size-16x16.ivf",
+ "av1-1-b8-01-size-16x18.ivf",
+ "av1-1-b8-01-size-16x32.ivf",
+ "av1-1-b8-01-size-16x34.ivf",
+ "av1-1-b8-01-size-16x64.ivf",
+ "av1-1-b8-01-size-16x66.ivf",
+ "av1-1-b8-01-size-18x16.ivf",
+ "av1-1-b8-01-size-18x18.ivf",
+ "av1-1-b8-01-size-18x32.ivf",
+ "av1-1-b8-01-size-18x34.ivf",
+ "av1-1-b8-01-size-18x64.ivf",
+ "av1-1-b8-01-size-18x66.ivf",
+ "av1-1-b8-01-size-196x196.ivf",
+ "av1-1-b8-01-size-196x198.ivf",
+ "av1-1-b8-01-size-196x200.ivf",
+ "av1-1-b8-01-size-196x202.ivf",
+ "av1-1-b8-01-size-196x208.ivf",
+ "av1-1-b8-01-size-196x210.ivf",
+ "av1-1-b8-01-size-196x224.ivf",
+ "av1-1-b8-01-size-196x226.ivf",
+ "av1-1-b8-01-size-198x196.ivf",
+ "av1-1-b8-01-size-198x198.ivf",
+ "av1-1-b8-01-size-198x200.ivf",
+ "av1-1-b8-01-size-198x202.ivf",
+ "av1-1-b8-01-size-198x208.ivf",
+ "av1-1-b8-01-size-198x210.ivf",
+ "av1-1-b8-01-size-198x224.ivf",
+ "av1-1-b8-01-size-198x226.ivf",
+ "av1-1-b8-01-size-200x196.ivf",
+ "av1-1-b8-01-size-200x198.ivf",
+ "av1-1-b8-01-size-200x200.ivf",
+ "av1-1-b8-01-size-200x202.ivf",
+ "av1-1-b8-01-size-200x208.ivf",
+ "av1-1-b8-01-size-200x210.ivf",
+ "av1-1-b8-01-size-200x224.ivf",
+ "av1-1-b8-01-size-200x226.ivf",
+ "av1-1-b8-01-size-202x196.ivf",
+ "av1-1-b8-01-size-202x198.ivf",
+ "av1-1-b8-01-size-202x200.ivf",
+ "av1-1-b8-01-size-202x202.ivf",
+ "av1-1-b8-01-size-202x208.ivf",
+ "av1-1-b8-01-size-202x210.ivf",
+ "av1-1-b8-01-size-202x224.ivf",
+ "av1-1-b8-01-size-202x226.ivf",
+ "av1-1-b8-01-size-208x196.ivf",
+ "av1-1-b8-01-size-208x198.ivf",
+ "av1-1-b8-01-size-208x200.ivf",
+ "av1-1-b8-01-size-208x202.ivf",
+ "av1-1-b8-01-size-208x208.ivf",
+ "av1-1-b8-01-size-208x210.ivf",
+ "av1-1-b8-01-size-208x224.ivf",
+ "av1-1-b8-01-size-208x226.ivf",
+ "av1-1-b8-01-size-210x196.ivf",
+ "av1-1-b8-01-size-210x198.ivf",
+ "av1-1-b8-01-size-210x200.ivf",
+ "av1-1-b8-01-size-210x202.ivf",
+ "av1-1-b8-01-size-210x208.ivf",
+ "av1-1-b8-01-size-210x210.ivf",
+ "av1-1-b8-01-size-210x224.ivf",
+ "av1-1-b8-01-size-210x226.ivf",
+ "av1-1-b8-01-size-224x196.ivf",
+ "av1-1-b8-01-size-224x198.ivf",
+ "av1-1-b8-01-size-224x200.ivf",
+ "av1-1-b8-01-size-224x202.ivf",
+ "av1-1-b8-01-size-224x208.ivf",
+ "av1-1-b8-01-size-224x210.ivf",
+ "av1-1-b8-01-size-224x224.ivf",
+ "av1-1-b8-01-size-224x226.ivf",
+ "av1-1-b8-01-size-226x196.ivf",
+ "av1-1-b8-01-size-226x198.ivf",
+ "av1-1-b8-01-size-226x200.ivf",
+ "av1-1-b8-01-size-226x202.ivf",
+ "av1-1-b8-01-size-226x208.ivf",
+ "av1-1-b8-01-size-226x210.ivf",
+ "av1-1-b8-01-size-226x224.ivf",
+ "av1-1-b8-01-size-226x226.ivf",
+ "av1-1-b8-01-size-32x16.ivf",
+ "av1-1-b8-01-size-32x18.ivf",
+ "av1-1-b8-01-size-32x32.ivf",
+ "av1-1-b8-01-size-32x34.ivf",
+ "av1-1-b8-01-size-32x64.ivf",
+ "av1-1-b8-01-size-32x66.ivf",
+ "av1-1-b8-01-size-34x16.ivf",
+ "av1-1-b8-01-size-34x18.ivf",
+ "av1-1-b8-01-size-34x32.ivf",
+ "av1-1-b8-01-size-34x34.ivf",
+ "av1-1-b8-01-size-34x64.ivf",
+ "av1-1-b8-01-size-34x66.ivf",
+ "av1-1-b8-01-size-64x16.ivf",
+ "av1-1-b8-01-size-64x18.ivf",
+ "av1-1-b8-01-size-64x32.ivf",
+ "av1-1-b8-01-size-64x34.ivf",
+ "av1-1-b8-01-size-64x64.ivf",
+ "av1-1-b8-01-size-64x66.ivf",
+ "av1-1-b8-01-size-66x16.ivf",
+ "av1-1-b8-01-size-66x18.ivf",
+ "av1-1-b8-01-size-66x32.ivf",
+ "av1-1-b8-01-size-66x34.ivf",
+ "av1-1-b8-01-size-66x64.ivf",
+ "av1-1-b8-01-size-66x66.ivf",
+ "av1-1-b8-02-allintra.ivf",
+ "av1-1-b8-03-sizedown.mkv",
+ "av1-1-b8-03-sizeup.mkv",
+ "av1-1-b8-04-cdfupdate.ivf",
+ "av1-1-b8-05-mv.ivf",
+ "av1-1-b8-06-mfmv.ivf",
+ "av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf",
+ "av1-1-b8-22-svc-L1T2.ivf",
+ "av1-1-b8-22-svc-L2T1.ivf",
+ "av1-1-b8-22-svc-L2T2.ivf",
+ "av1-1-b8-23-film_grain-50.ivf",
+ "av1-1-b8-24-monochrome.ivf"
+};
const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
#endif // CONFIG_AV1_DECODER
diff --git a/test/variance_test.cc b/test/variance_test.cc
index e96f933e0..25b8c8dd2 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -26,17 +26,22 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/aom_timer.h"
#include "aom_ports/mem.h"
+#include "av1/common/cdef_block.h"
namespace {
typedef uint64_t (*MseWxH16bitFunc)(uint8_t *dst, int dstride, uint16_t *src,
int sstride, int w, int h);
+typedef uint64_t (*Mse16xH16bitFunc)(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h);
typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse);
typedef void (*GetSseSum8x8QuadFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
- unsigned int *sse, int *sum);
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8);
typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
@@ -513,6 +518,139 @@ void MseWxHTestClass<MseWxHFunctionType>::RefMatchTestMse() {
}
}
+template <typename FunctionType>
+class Mse16xHTestClass
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+ // Memory required to compute the MSE of two 8x8 and four 4x4 blocks,
+ // allocated for a maximum width of 16 and a maximum height of 8.
+ int mem_size = 16 * 8;
+ virtual void SetUp() {
+ params_ = this->GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, mem_size * sizeof(*src_)));
+ dst_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(16, mem_size * sizeof(*dst_)));
+ ASSERT_NE(src_, nullptr);
+ ASSERT_NE(dst_, nullptr);
+ }
+
+ virtual void TearDown() {
+ aom_free(src_);
+ aom_free(dst_);
+ src_ = nullptr;
+ dst_ = nullptr;
+ }
+
+ uint8_t RandBool() {
+ const uint32_t value = rnd_.Rand8();
+ return (value & 0x1);
+ }
+
+ protected:
+ void RefMatchExtremeTestMse();
+ void RefMatchTestMse();
+ void SpeedTest();
+
+ protected:
+ ACMRandom rnd_;
+ uint8_t *dst_;
+ uint16_t *src_;
+ TestParams<FunctionType> params_;
+
+ // some relay helpers
+ int width() const { return params_.width; }
+ int height() const { return params_.height; }
+ int d_stride() const { return params_.width; }
+};
+
+template <typename Mse16xHFunctionType>
+void Mse16xHTestClass<Mse16xHFunctionType>::SpeedTest() {
+ aom_usec_timer ref_timer, test_timer;
+ double elapsed_time_c = 0.0;
+ double elapsed_time_simd = 0.0;
+ const int loop_count = 10000000;
+ const int w = width();
+ const int h = height();
+ const int dstride = d_stride();
+
+ for (int k = 0; k < mem_size; ++k) {
+ dst_[k] = rnd_.Rand8();
+ // Right shift by 6 is done so that more inputs fall in the range [0, 255]
+ // instead of being replaced with CDEF_VERY_LARGE.
+ int rnd_i10 = rnd_.Rand16() >> 6;
+ src_[k] = (rnd_i10 < 256) ? rnd_i10 : CDEF_VERY_LARGE;
+ }
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < loop_count; i++) {
+ aom_mse_16xh_16bit_c(dst_, dstride, src_, w, h);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ elapsed_time_c = static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < loop_count; i++) {
+ params_.func(dst_, dstride, src_, w, h);
+ }
+ aom_usec_timer_mark(&test_timer);
+ elapsed_time_simd = static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+ printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%.31f\n", width(),
+ height(), elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+}
+
+template <typename Mse16xHFunctionType>
+void Mse16xHTestClass<Mse16xHFunctionType>::RefMatchTestMse() {
+ uint64_t mse_ref = 0;
+ uint64_t mse_mod = 0;
+ const int w = width();
+ const int h = height();
+ const int dstride = d_stride();
+
+ for (int i = 0; i < 10; i++) {
+ for (int k = 0; k < mem_size; ++k) {
+ dst_[k] = rnd_.Rand8();
+ // Right shift by 6 is done so that more inputs fall in the range
+ // [0, 255] instead of being replaced with CDEF_VERY_LARGE.
+ int rnd_i10 = rnd_.Rand16() >> 6;
+ src_[k] = (rnd_i10 < 256) ? rnd_i10 : CDEF_VERY_LARGE;
+ }
+
+ API_REGISTER_STATE_CHECK(
+ mse_ref = aom_mse_16xh_16bit_c(dst_, dstride, src_, w, h));
+ API_REGISTER_STATE_CHECK(mse_mod = params_.func(dst_, dstride, src_, w, h));
+ EXPECT_EQ(mse_ref, mse_mod)
+ << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+ }
+}
+
+template <typename Mse16xHFunctionType>
+void Mse16xHTestClass<Mse16xHFunctionType>::RefMatchExtremeTestMse() {
+ uint64_t mse_ref = 0;
+ uint64_t mse_mod = 0;
+ const int w = width();
+ const int h = height();
+ const int dstride = d_stride();
+ const int iter = 10;
+
+ // Fill the buffers with extreme values
+ for (int i = 0; i < iter; i++) {
+ for (int k = 0; k < mem_size; ++k) {
+ dst_[k] = static_cast<uint8_t>(RandBool() ? 0 : 255);
+ src_[k] = static_cast<uint16_t>(RandBool() ? 0 : CDEF_VERY_LARGE);
+ }
+
+ API_REGISTER_STATE_CHECK(
+ mse_ref = aom_mse_16xh_16bit_c(dst_, dstride, src_, w, h));
+ API_REGISTER_STATE_CHECK(mse_mod = params_.func(dst_, dstride, src_, w, h));
+ EXPECT_EQ(mse_ref, mse_mod)
+ << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+ }
+}
+
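As context for the source-buffer filling in the tests above, a minimal sketch of the sample generation, with a hypothetical MakeMseSourceSample helper name; CDEF_VERY_LARGE comes from av1/common/cdef_block.h, which the test now includes.

#include <cstdint>

#include "av1/common/cdef_block.h"  // CDEF_VERY_LARGE

// Hypothetical illustration only: Rand16() >> 6 produces a 10-bit value in
// [0, 1023]; values below 256 are kept as ordinary pixels, everything else
// is replaced with the CDEF sentinel so the kernels also see that input.
static inline uint16_t MakeMseSourceSample(uint16_t rand16) {
  const int value10 = rand16 >> 6;
  return (value10 < 256) ? static_cast<uint16_t>(value10) : CDEF_VERY_LARGE;
}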
// Main class for testing a function type
template <typename FunctionType>
class MainTestClass
@@ -723,26 +861,34 @@ void MainTestClass<GetSseSum8x8QuadFuncType>::RefTestSseSum() {
}
unsigned int sse1[256] = { 0 };
unsigned int sse2[256] = { 0 };
+ unsigned int var1[256] = { 0 };
+ unsigned int var2[256] = { 0 };
int sum1[256] = { 0 };
int sum2[256] = { 0 };
+ unsigned int sse_tot_c = 0;
+ unsigned int sse_tot_simd = 0;
+ int sum_tot_c = 0;
+ int sum_tot_simd = 0;
const int stride = width();
int k = 0;
for (int i = 0; i < height(); i += 8) {
for (int j = 0; j < width(); j += 32) {
- API_REGISTER_STATE_CHECK(params_.func(src_ + stride * i + j, stride,
- ref_ + stride * i + j, stride,
- &sse1[k], &sum1[k]));
- aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
- ref_ + stride * i + j, stride, &sse2[k],
- &sum2[k]);
+ API_REGISTER_STATE_CHECK(params_.func(
+ src_ + stride * i + j, stride, ref_ + stride * i + j, stride,
+ &sse1[k], &sum1[k], &sse_tot_simd, &sum_tot_simd, &var1[k]));
+ aom_get_var_sse_sum_8x8_quad_c(
+ src_ + stride * i + j, stride, ref_ + stride * i + j, stride,
+ &sse2[k], &sum2[k], &sse_tot_c, &sum_tot_c, &var2[k]);
k += 4;
}
}
-
+ EXPECT_EQ(sse_tot_c, sse_tot_simd);
+ EXPECT_EQ(sum_tot_c, sum_tot_simd);
for (int p = 0; p < 256; p++) {
EXPECT_EQ(sse1[p], sse2[p]);
EXPECT_EQ(sum1[p], sum2[p]);
+ EXPECT_EQ(var1[p], var2[p]);
}
}
}
@@ -753,26 +899,34 @@ void MainTestClass<GetSseSum8x8QuadFuncType>::MinTestSseSum() {
memset(ref_, 255, block_size());
unsigned int sse1[256] = { 0 };
unsigned int sse2[256] = { 0 };
+ unsigned int var1[256] = { 0 };
+ unsigned int var2[256] = { 0 };
int sum1[256] = { 0 };
int sum2[256] = { 0 };
+ unsigned int sse_tot_c = 0;
+ unsigned int sse_tot_simd = 0;
+ int sum_tot_c = 0;
+ int sum_tot_simd = 0;
const int stride = width();
int k = 0;
for (int i = 0; i < height(); i += 8) {
for (int j = 0; j < width(); j += 32) {
- API_REGISTER_STATE_CHECK(params_.func(src_ + stride * i + j, stride,
- ref_ + stride * i + j, stride,
- &sse1[k], &sum1[k]));
- aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
- ref_ + stride * i + j, stride, &sse2[k],
- &sum2[k]);
+ API_REGISTER_STATE_CHECK(params_.func(
+ src_ + stride * i + j, stride, ref_ + stride * i + j, stride,
+ &sse1[k], &sum1[k], &sse_tot_simd, &sum_tot_simd, &var1[k]));
+ aom_get_var_sse_sum_8x8_quad_c(
+ src_ + stride * i + j, stride, ref_ + stride * i + j, stride,
+ &sse2[k], &sum2[k], &sse_tot_c, &sum_tot_c, &var2[k]);
k += 4;
}
}
-
+ EXPECT_EQ(sse_tot_simd, sse_tot_c);
+ EXPECT_EQ(sum_tot_simd, sum_tot_c);
for (int p = 0; p < 256; p++) {
EXPECT_EQ(sse1[p], sse2[p]);
EXPECT_EQ(sum1[p], sum2[p]);
+ EXPECT_EQ(var1[p], var2[p]);
}
}
@@ -782,26 +936,35 @@ void MainTestClass<GetSseSum8x8QuadFuncType>::MaxTestSseSum() {
memset(ref_, 0, block_size());
unsigned int sse1[256] = { 0 };
unsigned int sse2[256] = { 0 };
+ unsigned int var1[256] = { 0 };
+ unsigned int var2[256] = { 0 };
int sum1[256] = { 0 };
int sum2[256] = { 0 };
+ unsigned int sse_tot_c = 0;
+ unsigned int sse_tot_simd = 0;
+ int sum_tot_c = 0;
+ int sum_tot_simd = 0;
const int stride = width();
int k = 0;
for (int i = 0; i < height(); i += 8) {
for (int j = 0; j < width(); j += 32) {
- API_REGISTER_STATE_CHECK(params_.func(src_ + stride * i + j, stride,
- ref_ + stride * i + j, stride,
- &sse1[k], &sum1[k]));
- aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
- ref_ + stride * i + j, stride, &sse2[k],
- &sum2[k]);
+ API_REGISTER_STATE_CHECK(params_.func(
+ src_ + stride * i + j, stride, ref_ + stride * i + j, stride,
+ &sse1[k], &sum1[k], &sse_tot_simd, &sum_tot_simd, &var1[k]));
+ aom_get_var_sse_sum_8x8_quad_c(
+ src_ + stride * i + j, stride, ref_ + stride * i + j, stride,
+ &sse2[k], &sum2[k], &sse_tot_c, &sum_tot_c, &var2[k]);
k += 4;
}
}
+ EXPECT_EQ(sse_tot_c, sse_tot_simd);
+ EXPECT_EQ(sum_tot_c, sum_tot_simd);
for (int p = 0; p < 256; p++) {
EXPECT_EQ(sse1[p], sse2[p]);
EXPECT_EQ(sum1[p], sum2[p]);
+ EXPECT_EQ(var1[p], var2[p]);
}
}
@@ -813,21 +976,26 @@ void MainTestClass<GetSseSum8x8QuadFuncType>::SseSum_SpeedTest() {
ref_[j] = rnd_.Rand8();
}
- unsigned int sse1[4] = { 0 };
- unsigned int sse2[4] = { 0 };
- int sum1[4] = { 0 };
- int sum2[4] = { 0 };
+ unsigned int sse1 = 0;
+ unsigned int sse2 = 0;
+ unsigned int var1 = 0;
+ unsigned int var2 = 0;
+ int sum1 = 0;
+ int sum2 = 0;
+ unsigned int sse_tot_c = 0;
+ unsigned int sse_tot_simd = 0;
+ int sum_tot_c = 0;
+ int sum_tot_simd = 0;
const int stride = width();
- const int k = 0;
aom_usec_timer timer;
aom_usec_timer_start(&timer);
for (int r = 0; r < loop_count; ++r) {
for (int i = 0; i < height(); i += 8) {
for (int j = 0; j < width(); j += 32) {
- aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
- ref_ + stride * i + j, stride, &sse2[k],
- &sum2[k]);
+ aom_get_var_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
+ ref_ + stride * i + j, stride, &sse2,
+ &sum2, &sse_tot_c, &sum_tot_c, &var2);
}
}
}
@@ -840,7 +1008,7 @@ void MainTestClass<GetSseSum8x8QuadFuncType>::SseSum_SpeedTest() {
for (int i = 0; i < height(); i += 8) {
for (int j = 0; j < width(); j += 32) {
params_.func(src_ + stride * i + j, stride, ref_ + stride * i + j,
- stride, &sse1[k], &sum1[k]);
+ stride, &sse1, &sum1, &sse_tot_simd, &sum_tot_simd, &var1);
}
}
}
@@ -1327,6 +1495,7 @@ void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
#endif // !CONFIG_REALTIME_ONLY
typedef MseWxHTestClass<MseWxH16bitFunc> MseWxHTest;
+typedef Mse16xHTestClass<Mse16xH16bitFunc> Mse16xHTest;
typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
@@ -1339,11 +1508,15 @@ typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
#endif
typedef TestParams<MseWxH16bitFunc> MseWxHParams;
+typedef TestParams<Mse16xH16bitFunc> Mse16xHParams;
TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
TEST_P(MseWxHTest, RefMse) { RefMatchTestMse(); }
TEST_P(MseWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
+TEST_P(Mse16xHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(Mse16xHTest, RefMseExtreme) { RefMatchExtremeTestMse(); }
+TEST_P(Mse16xHTest, DISABLED_SpeedMse) { SpeedTest(); }
TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
@@ -1375,6 +1548,13 @@ INSTANTIATE_TEST_SUITE_P(
MseWxHParams(2, 3, &aom_mse_wxh_16bit_c, 8),
MseWxHParams(2, 2, &aom_mse_wxh_16bit_c, 8)));
+INSTANTIATE_TEST_SUITE_P(
+ C, Mse16xHTest,
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_c, 8),
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_c, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_c, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_c, 8)));
+
INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
::testing::Values(aom_get_mb_ss_c));
@@ -1422,10 +1602,10 @@ INSTANTIATE_TEST_SUITE_P(C, AvxVarianceTest,
typedef TestParams<GetSseSum8x8QuadFunc> GetSseSumParams;
const GetSseSumParams kArrayGetSseSum8x8Quad_c[] = {
- GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_c, 0),
- GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_c, 0),
- GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_c, 0),
- GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_c, 0)
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_c, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_c, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_c, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_c, 0)
};
INSTANTIATE_TEST_SUITE_P(C, GetSseSum8x8QuadTest,
::testing::ValuesIn(kArrayGetSseSum8x8Quad_c));
@@ -2117,6 +2297,13 @@ INSTANTIATE_TEST_SUITE_P(
MseWxHParams(2, 3, &aom_mse_wxh_16bit_sse2, 8),
MseWxHParams(2, 2, &aom_mse_wxh_16bit_sse2, 8)));
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, Mse16xHTest,
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_sse2, 8),
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_sse2, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_sse2, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_sse2, 8)));
+
INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest,
::testing::Values(aom_get_mb_ss_sse2));
@@ -2156,10 +2343,10 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvxVarianceTest,
::testing::ValuesIn(kArrayVariance_sse2));
const GetSseSumParams kArrayGetSseSum8x8Quad_sse2[] = {
- GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_sse2, 0),
- GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_sse2, 0),
- GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_sse2, 0),
- GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_sse2, 0)
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_sse2, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_sse2, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_sse2, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_sse2, 0)
};
INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum8x8QuadTest,
::testing::ValuesIn(kArrayGetSseSum8x8Quad_sse2));
@@ -2740,6 +2927,13 @@ INSTANTIATE_TEST_SUITE_P(
MseWxHParams(2, 3, &aom_mse_wxh_16bit_avx2, 8),
MseWxHParams(2, 2, &aom_mse_wxh_16bit_avx2, 8)));
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, Mse16xHTest,
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_avx2, 8),
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_avx2, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_avx2, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_avx2, 8)));
+
INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest,
::testing::Values(MseParams(4, 4,
&aom_mse16x16_avx2)));
@@ -2767,10 +2961,10 @@ INSTANTIATE_TEST_SUITE_P(AVX2, AvxVarianceTest,
::testing::ValuesIn(kArrayVariance_avx2));
const GetSseSumParams kArrayGetSseSum8x8Quad_avx2[] = {
- GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_avx2, 0),
- GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_avx2, 0),
- GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_avx2, 0),
- GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_avx2, 0)
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_avx2, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_avx2, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_avx2, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_avx2, 0)
};
INSTANTIATE_TEST_SUITE_P(AVX2, GetSseSum8x8QuadTest,
::testing::ValuesIn(kArrayGetSseSum8x8Quad_avx2));
@@ -2814,33 +3008,53 @@ INSTANTIATE_TEST_SUITE_P(
#endif // HAVE_AVX2
#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, MseWxHTest,
+ ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_neon, 8),
+ MseWxHParams(3, 2, &aom_mse_wxh_16bit_neon, 8),
+ MseWxHParams(2, 3, &aom_mse_wxh_16bit_neon, 8),
+ MseWxHParams(2, 2, &aom_mse_wxh_16bit_neon, 8)));
+
INSTANTIATE_TEST_SUITE_P(NEON, AvxSseTest,
::testing::Values(SseParams(2, 2,
&aom_get4x4sse_cs_neon)));
INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest,
- ::testing::Values(MseParams(4, 4,
- &aom_mse16x16_neon)));
+ ::testing::Values(MseParams(3, 3, &aom_mse8x8_neon),
+ MseParams(3, 4, &aom_mse8x16_neon),
+ MseParams(4, 4, &aom_mse16x16_neon),
+ MseParams(4, 3, &aom_mse16x8_neon)));
+
+const VarianceParams kArrayVariance_neon[] = {
+ VarianceParams(7, 7, &aom_variance128x128_neon),
+ VarianceParams(6, 6, &aom_variance64x64_neon),
+ VarianceParams(7, 6, &aom_variance128x64_neon),
+ VarianceParams(6, 7, &aom_variance64x128_neon),
+ VarianceParams(6, 6, &aom_variance64x64_neon),
+ VarianceParams(6, 5, &aom_variance64x32_neon),
+ VarianceParams(5, 6, &aom_variance32x64_neon),
+ VarianceParams(5, 5, &aom_variance32x32_neon),
+ VarianceParams(5, 4, &aom_variance32x16_neon),
+ VarianceParams(4, 5, &aom_variance16x32_neon),
+ VarianceParams(4, 4, &aom_variance16x16_neon),
+ VarianceParams(4, 3, &aom_variance16x8_neon),
+ VarianceParams(3, 4, &aom_variance8x16_neon),
+ VarianceParams(3, 3, &aom_variance8x8_neon),
+ VarianceParams(3, 2, &aom_variance8x4_neon),
+ VarianceParams(2, 3, &aom_variance4x8_neon),
+ VarianceParams(2, 2, &aom_variance4x4_neon),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(2, 4, &aom_variance4x16_neon),
+ VarianceParams(4, 2, &aom_variance16x4_neon),
+ VarianceParams(3, 5, &aom_variance8x32_neon),
+ VarianceParams(5, 3, &aom_variance32x8_neon),
+ VarianceParams(4, 6, &aom_variance16x64_neon),
+ VarianceParams(6, 4, &aom_variance64x16_neon),
+#endif
+};
-INSTANTIATE_TEST_SUITE_P(
- NEON, AvxVarianceTest,
- ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon),
- VarianceParams(6, 6, &aom_variance64x64_neon),
- VarianceParams(7, 6, &aom_variance128x64_neon),
- VarianceParams(6, 7, &aom_variance64x128_neon),
- VarianceParams(6, 6, &aom_variance64x64_neon),
- VarianceParams(6, 5, &aom_variance64x32_neon),
- VarianceParams(5, 6, &aom_variance32x64_neon),
- VarianceParams(5, 5, &aom_variance32x32_neon),
- VarianceParams(5, 4, &aom_variance32x16_neon),
- VarianceParams(4, 5, &aom_variance16x32_neon),
- VarianceParams(4, 4, &aom_variance16x16_neon),
- VarianceParams(4, 3, &aom_variance16x8_neon),
- VarianceParams(3, 4, &aom_variance8x16_neon),
- VarianceParams(3, 3, &aom_variance8x8_neon),
- VarianceParams(3, 2, &aom_variance8x4_neon),
- VarianceParams(2, 3, &aom_variance4x8_neon),
- VarianceParams(2, 2, &aom_variance4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(NEON, AvxVarianceTest,
+ ::testing::ValuesIn(kArrayVariance_neon));
const SubpelVarianceParams kArraySubpelVariance_neon[] = {
SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon, 0),
@@ -2871,11 +3085,40 @@ const SubpelVarianceParams kArraySubpelVariance_neon[] = {
INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelVarianceTest,
::testing::ValuesIn(kArraySubpelVariance_neon));
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_neon[] = {
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_neon, 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_neon, 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_neon, 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_neon, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_neon, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_neon, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_neon, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_neon, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_neon, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_neon, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_neon, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_neon, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_neon, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_neon, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_neon, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY
+ SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_neon, 0),
+ SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_neon, 0),
+ SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_neon, 0),
+ SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_neon, 0),
+ SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_neon, 0),
+ SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_neon, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArraySubpelAvgVariance_neon));
+
const GetSseSumParams kArrayGetSseSum8x8Quad_neon[] = {
- GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_neon, 0),
- GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_neon, 0),
- GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_neon, 0),
- GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_neon, 0)
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_neon, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_neon, 0)
};
INSTANTIATE_TEST_SUITE_P(NEON, GetSseSum8x8QuadTest,
::testing::ValuesIn(kArrayGetSseSum8x8Quad_neon));
@@ -2914,68 +3157,4 @@ INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDVarianceTest,
#endif // HAVE_NEON
-#if HAVE_MSA
-INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
- ::testing::Values(aom_get_mb_ss_msa));
-
-INSTANTIATE_TEST_SUITE_P(MSA, AvxSseTest,
- ::testing::Values(SseParams(2, 2,
- &aom_get4x4sse_cs_msa)));
-
-INSTANTIATE_TEST_SUITE_P(MSA, AvxMseTest,
- ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa),
- MseParams(4, 3, &aom_mse16x8_msa),
- MseParams(3, 4, &aom_mse8x16_msa),
- MseParams(3, 3, &aom_mse8x8_msa)));
-
-INSTANTIATE_TEST_SUITE_P(
- MSA, AvxVarianceTest,
- ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_msa),
- VarianceParams(6, 5, &aom_variance64x32_msa),
- VarianceParams(5, 6, &aom_variance32x64_msa),
- VarianceParams(5, 5, &aom_variance32x32_msa),
- VarianceParams(5, 4, &aom_variance32x16_msa),
- VarianceParams(4, 5, &aom_variance16x32_msa),
- VarianceParams(4, 4, &aom_variance16x16_msa),
- VarianceParams(4, 3, &aom_variance16x8_msa),
- VarianceParams(3, 4, &aom_variance8x16_msa),
- VarianceParams(3, 3, &aom_variance8x8_msa),
- VarianceParams(3, 2, &aom_variance8x4_msa),
- VarianceParams(2, 3, &aom_variance4x8_msa),
- VarianceParams(2, 2, &aom_variance4x4_msa)));
-
-INSTANTIATE_TEST_SUITE_P(
- MSA, AvxSubpelVarianceTest,
- ::testing::Values(
- SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_msa, 0),
- SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_msa, 0),
- SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_msa, 0),
- SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_msa, 0),
- SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_msa, 0),
- SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_msa, 0),
- SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_msa, 0),
- SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_msa, 0),
- SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_msa, 0),
- SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_msa, 0),
- SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_msa, 0),
- SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_msa, 0),
- SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_msa, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
- MSA, AvxSubpelAvgVarianceTest,
- ::testing::Values(
- SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_msa, 0),
- SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_msa, 0),
- SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_msa, 0),
- SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_msa, 0),
- SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_msa, 0),
- SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_msa, 0),
- SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_msa, 0),
- SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_msa, 0),
- SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_msa, 0),
- SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_msa, 0),
- SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_msa, 0),
- SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_msa, 0),
- SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_msa, 0)));
-#endif // HAVE_MSA
} // namespace
diff --git a/test/video_source.h b/test/video_source.h
index 742178edd..7d953f489 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -215,7 +215,7 @@ class DummyVideoSource : public VideoSource {
aom_img_free(img_);
img_ = aom_img_alloc(nullptr, format_, width_, height_, 32);
ASSERT_NE(img_, nullptr);
- raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8;
+ raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8;
}
aom_image_t *img_;
@@ -232,7 +232,6 @@ class RandomVideoSource : public DummyVideoSource {
RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
: rnd_(seed), seed_(seed) {}
- protected:
// Reset the RNG to get a matching stream for the second pass
virtual void Begin() {
frame_ = 0;
@@ -240,6 +239,7 @@ class RandomVideoSource : public DummyVideoSource {
FillFrame();
}
+ protected:
// 15 frames of noise, followed by 15 static frames. Reset to 0 rather
// than holding previous frames to encourage keyframes to be thrown.
virtual void FillFrame() {
diff --git a/test/wiener_test.cc b/test/wiener_test.cc
index 69df5ea91..d44dd9253 100644
--- a/test/wiener_test.cc
+++ b/test/wiener_test.cc
@@ -10,6 +10,7 @@
*/
#include <tuple>
+#include <utility>
#include <vector>
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -221,10 +222,12 @@ void WienerTest::RunWienerTest(const int32_t wiener_win, int32_t run_times) {
// will always be multiples of 64 when called from non-test code.
// If in future any new requirements are added, these lines will
// need changing.
- const int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
+ int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
int h_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
- const int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
+ if (h_start > h_end) std::swap(h_start, h_end);
+ int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
int v_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+ if (v_start > v_end) std::swap(v_start, v_end);
const int dgd_stride = h_end;
const int src_stride = MAX_DATA_BLOCK;
const int iters = run_times == 1 ? kIterations : 2;
@@ -551,10 +554,12 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win,
// will always be multiples of 64 when called from non-test code.
// If in future any new requirements are added, these lines will
// need changing.
- const int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
+ int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
int h_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
- const int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
+ if (h_start > h_end) std::swap(h_start, h_end);
+ int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
int v_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+ if (v_start > v_end) std::swap(v_start, v_end);
const int dgd_stride = h_end;
const int src_stride = MAX_DATA_BLOCK;
const int iters = run_times == 1 ? kIterations : 2;
diff --git a/third_party/SVT-AV1/EbMemory_AVX2.h b/third_party/SVT-AV1/EbMemory_AVX2.h
new file mode 100644
index 000000000..0d0ea10ab
--- /dev/null
+++ b/third_party/SVT-AV1/EbMemory_AVX2.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright(c) 2019 Intel Corporation
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at https://www.aomedia.org/license/software-license. If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * https://www.aomedia.org/license/patent-license.
+ */
+
+#ifndef AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_
+#define AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifndef _mm256_set_m128i
+#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
+#endif
+
+#ifndef _mm256_setr_m128i
+#define _mm256_setr_m128i(/* __m128i */ lo, /* __m128i */ hi) \
+ _mm256_set_m128i((hi), (lo))
+#endif
+
+static INLINE __m256i load_u8_4x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride) {
+ __m128i src01;
+ src01 = _mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride));
+ src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1);
+ return _mm256_setr_m128i(src01, _mm_setzero_si128());
+}
+
+static INLINE __m256i load_u8_4x4_avx2(const uint8_t *const src,
+ const ptrdiff_t stride) {
+ __m128i src01, src23;
+ src01 = _mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride));
+ src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1);
+ src23 = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
+ src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
+ return _mm256_setr_m128i(src01, src23);
+}
+
+static INLINE __m256i load_u8_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride) {
+ const __m128i src0 = _mm_loadl_epi64((__m128i *)(src + 0 * stride));
+ const __m128i src1 = _mm_loadl_epi64((__m128i *)(src + 1 * stride));
+ return _mm256_setr_m128i(src0, src1);
+}
+
+static INLINE __m256i load_u8_8x4_avx2(const uint8_t *const src,
+ const ptrdiff_t stride) {
+ __m128i src01, src23;
+ src01 = _mm_loadl_epi64((__m128i *)(src + 0 * stride));
+ src01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(src01),
+ (double *)(void *)(src + 1 * stride)));
+ src23 = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
+ src23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(src23),
+ (double *)(void *)(src + 3 * stride)));
+ return _mm256_setr_m128i(src01, src23);
+}
+
+static INLINE __m256i loadu_8bit_16x2_avx2(const void *const src,
+ const ptrdiff_t strideInByte) {
+ const __m128i src0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i src1 =
+ _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte));
+ return _mm256_setr_m128i(src0, src1);
+}
+
+static INLINE __m256i loadu_u8_16x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride) {
+ return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride);
+}
+
+static INLINE __m256i loadu_u16_8x2_avx2(const uint16_t *const src,
+ const ptrdiff_t stride) {
+ return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride);
+}
+
+static INLINE void storeu_8bit_16x2_avx2(const __m256i src, void *const dst,
+ const ptrdiff_t strideInByte) {
+ const __m128i d0 = _mm256_castsi256_si128(src);
+ const __m128i d1 = _mm256_extracti128_si256(src, 1);
+ _mm_storeu_si128((__m128i *)dst, d0);
+ _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1);
+}
+
+static INLINE void storeu_u8_16x2_avx2(const __m256i src, uint8_t *const dst,
+ const ptrdiff_t stride) {
+ storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
+}
+
+static INLINE void storeu_s16_8x2_avx2(const __m256i src, int16_t *const dst,
+ const ptrdiff_t stride) {
+ storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
+}
+
+static INLINE void storeu_u16_8x2_avx2(const __m256i src, uint16_t *const dst,
+ const ptrdiff_t stride) {
+ storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
+}
+
+#endif // AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_
diff --git a/third_party/SVT-AV1/EbMemory_SSE4_1.h b/third_party/SVT-AV1/EbMemory_SSE4_1.h
new file mode 100644
index 000000000..d821d9a30
--- /dev/null
+++ b/third_party/SVT-AV1/EbMemory_SSE4_1.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2019 Intel Corporation
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at https://www.aomedia.org/license/software-license. If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * https://www.aomedia.org/license/patent-license.
+ */
+
+#ifndef AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_SSE4_1_H_
+#define AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_SSE4_1_H_
+
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE __m128i load8bit_4x2_sse4_1(const void *const src,
+ const ptrdiff_t strideInByte) {
+ const __m128i s = _mm_cvtsi32_si128(*(int32_t *)((uint8_t *)src));
+ return _mm_insert_epi32(s, *(int32_t *)((uint8_t *)src + strideInByte), 1);
+}
+
+static INLINE __m128i load_u8_4x2_sse4_1(const uint8_t *const src,
+ const ptrdiff_t stride) {
+ return load8bit_4x2_sse4_1(src, sizeof(*src) * stride);
+}
+
+static INLINE __m128i load_u16_2x2_sse4_1(const uint16_t *const src,
+ const ptrdiff_t stride) {
+ return load8bit_4x2_sse4_1(src, sizeof(*src) * stride);
+}
+
+#endif // AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_SSE4_1_H_
diff --git a/third_party/SVT-AV1/LICENSE.md b/third_party/SVT-AV1/LICENSE.md
new file mode 100644
index 000000000..aff96d15e
--- /dev/null
+++ b/third_party/SVT-AV1/LICENSE.md
@@ -0,0 +1,32 @@
+BSD 3-Clause Clear License
+The Clear BSD License
+
+Copyright (c) 2021, Alliance for Open Media
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the disclaimer below)
+provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the Alliance for Open Media nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/SVT-AV1/PATENTS.md b/third_party/SVT-AV1/PATENTS.md
new file mode 100644
index 000000000..1de4dd753
--- /dev/null
+++ b/third_party/SVT-AV1/PATENTS.md
@@ -0,0 +1,107 @@
+**Alliance for Open Media Patent License 1.0**
+
+ 1. **License Terms.**
+
+ **Patent License.** Subject to the terms and conditions of this License, each
+ Licensor, on behalf of itself and successors in interest and assigns,
+ grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
+ no-charge, royalty-free, irrevocable (except as expressly stated in this
+ License) patent license to its Necessary Claims to make, use, sell, offer
+ for sale, import or distribute any Implementation.
+
+ **Conditions.**
+
+ *Availability.* As a condition to the grant of rights to Licensee to make,
+ sell, offer for sale, import or distribute an Implementation under
+ Section 1.1, Licensee must make its Necessary Claims available under
+ this License, and must reproduce this License with any Implementation
+ as follows:
+
+ a. For distribution in source code, by including this License in the
+ root directory of the source code with its Implementation.
+
+ b. For distribution in any other form (including binary, object form,
+ and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
+ GDSII, etc.)), by including this License in the documentation, legal
+ notices, and/or other written materials provided with the
+ Implementation.
+
+ *Additional Conditions.* This license is directly from Licensor to
+ Licensee. Licensee acknowledges as a condition of benefiting from it
+ that no rights from Licensor are received from suppliers, distributors,
+ or otherwise in connection with this License.
+
+ **Defensive Termination**. If any Licensee, its Affiliates, or its agents
+ initiates patent litigation or files, maintains, or voluntarily
+ participates in a lawsuit against another entity or any person asserting
+ that any Implementation infringes Necessary Claims, any patent licenses
+ granted under this License directly to the Licensee are immediately
+ terminated as of the date of the initiation of action unless 1) that suit
+ was in response to a corresponding suit regarding an Implementation first
+ brought against an initiating entity, or 2) that suit was brought to
+ enforce the terms of this License (including intervention in a third-party
+ action by a Licensee).
+
+ **Disclaimers.** The Reference Implementation and Specification are provided
+ "AS IS" and without warranty. The entire risk as to implementing or
+ otherwise using the Reference Implementation or Specification is assumed
+ by the implementer and user. Licensor expressly disclaims any warranties
+ (express, implied, or otherwise), including implied warranties of
+ merchantability, non-infringement, fitness for a particular purpose, or
+ title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
+ ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
+ INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
+ ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
+ OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
+ NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+2. **Definitions.**
+
+ **Affiliate.** "Affiliate" means an entity that directly or indirectly
+ Controls, is Controlled by, or is under common Control of that party.
+
+ **Control.** "Control" means direct or indirect control of more than 50% of
+ the voting power to elect directors of that corporation, or for any other
+ entity, the power to direct management of such entity.
+
+ **Decoder.** "Decoder" means any decoder that conforms fully with all
+ non-optional portions of the Specification.
+
+ **Encoder.** "Encoder" means any encoder that produces a bitstream that can
+ be decoded by a Decoder only to the extent it produces such a bitstream.
+
+ **Final Deliverable.** "Final Deliverable" means the final version of a
+ deliverable approved by the Alliance for Open Media as a Final
+ Deliverable.
+
+ **Implementation.** "Implementation" means any implementation, including the
+ Reference Implementation, that is an Encoder and/or a Decoder. An
+ Implementation also includes components of an Implementation only to the
+ extent they are used as part of an Implementation.
+
+ **License.** "License" means this license.
+
+ **Licensee.** "Licensee" means any person or entity who exercises patent
+ rights granted under this License.
+
+ **Licensor.** "Licensor" means (i) any Licensee that makes, sells, offers
+ for sale, imports or distributes any Implementation, or (ii) a person
+ or entity that has a licensing obligation to the Implementation as a
+ result of its membership and/or participation in the Alliance for Open
+ Media working group that developed the Specification.
+
+ **Necessary Claims.** "Necessary Claims" means all claims of patents or
+ patent applications, (a) that currently or at any time in the future,
+ are owned or controlled by the Licensor, and (b) (i) would be an
+ Essential Claim as defined by the W3C Policy as of February 5, 2004
+ (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
+ as if the Specification was a W3C Recommendation; or (ii) are infringed
+ by the Reference Implementation.
+
+ **Reference Implementation.** "Reference Implementation" means an Encoder
+ and/or Decoder released by the Alliance for Open Media as a Final
+ Deliverable.
+
+ **Specification.** "Specification" means the specification designated by
+ the Alliance for Open Media as a Final Deliverable for which this
+ License was issued.
diff --git a/third_party/SVT-AV1/README.libaom b/third_party/SVT-AV1/README.libaom
new file mode 100644
index 000000000..ff365057e
--- /dev/null
+++ b/third_party/SVT-AV1/README.libaom
@@ -0,0 +1,14 @@
+URL: https://gitlab.com/AOMediaCodec/SVT-AV1
+
+Version: 8ff99c90359330d2e807757c9425560bbc452ff3
+License: BSD-3-clause clear
+License File: LICENSE.md
+
+Description:
+Port the x86 intrinsics used for single reference convolve reconstructions.
+
+Local Changes:
+Only ported the functions pertinent to single reference convolves.
+All functions are made static inline to avoid function call overheads.
+References to some arrays are changed to the libaom versions where applicable.
+Some extra intrinsic functions are added to support missing block sizes.
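As a minimal usage sketch (assuming a libaom build context for the include path; the helper name below is hypothetical and not part of the port), the static inline loads and stores above move two rows at a time through a single 256-bit register:

    #include <stddef.h>
    #include <stdint.h>
    #include "third_party/SVT-AV1/EbMemory_AVX2.h"

    // Hypothetical example: copy two 8-pixel rows with the ported helpers.
    // load_u8_8x2_avx2() returns row 0 in the lower 128 bits and row 1 in the
    // upper 128 bits; each row sits in the low 64 bits of its lane.
    static inline void copy_u8_8x2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride) {
      const __m256i rows = load_u8_8x2_avx2(src, src_stride);
      _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(rows));
      _mm_storel_epi64((__m128i *)(dst + dst_stride),
                       _mm256_extracti128_si256(rows, 1));
    }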
diff --git a/third_party/SVT-AV1/convolve_2d_avx2.h b/third_party/SVT-AV1/convolve_2d_avx2.h
new file mode 100644
index 000000000..64cd810f7
--- /dev/null
+++ b/third_party/SVT-AV1/convolve_2d_avx2.h
@@ -0,0 +1,1199 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
+#define THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
+
+#include "convolve_avx2.h"
+
+static void convolve_2d_sr_hor_2tap_avx2(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block) {
+ const uint8_t *src_ptr = src;
+ int32_t y = h;
+ int16_t *im = im_block;
+
+ if (w <= 8) {
+ __m128i coeffs_128;
+
+ prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);
+
+ if (w == 2) {
+ do {
+ const __m128i r =
+ x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
+ xy_x_round_store_2x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 2;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i r =
+ x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
+ xy_x_round_store_4x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 4;
+ y -= 2;
+ } while (y);
+ } else {
+ assert(w == 8);
+
+ do {
+ __m128i r[2];
+
+ x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
+ xy_x_round_store_8x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 8;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ __m256i coeffs_256;
+
+ prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);
+
+ if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
+ xy_x_round_store_32_avx2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 16;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
+ src_ptr += src_stride;
+ im += 32;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
+ src_ptr += src_stride;
+ im += 64;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
+ src_ptr += src_stride;
+ im += 128;
+ } while (--y);
+ }
+ }
+}
+
+static void convolve_2d_sr_hor_4tap_ssse3(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block) {
+ const uint8_t *src_ptr = src - 1;
+ int32_t y = h;
+ int16_t *im = im_block;
+
+ if (w <= 4) {
+ __m128i coeffs_128[2];
+
+ prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
+ if (w == 2) {
+ do {
+ const __m128i r =
+ x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
+ xy_x_round_store_2x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 2;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i r =
+ x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+ xy_x_round_store_4x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 4;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ // TODO(chiyotsai@google.com): Add better optimization
+ __m256i coeffs_256[2], filt_256[2];
+
+ prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+ filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+
+ if (w == 8) {
+ do {
+ __m256i res =
+ x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
+ xy_x_round_store_8x2_avx2(res, im);
+
+ src_ptr += 2 * src_stride;
+ im += 2 * 8;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
+ xy_x_round_store_32_avx2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 16;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+
+ src_ptr += src_stride;
+ im += 32;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ src_ptr += src_stride;
+ im += 64;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ xy_x_4tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
+ xy_x_4tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
+ src_ptr += src_stride;
+ im += 128;
+ } while (--y);
+ }
+ }
+}
+
+static void convolve_2d_sr_hor_6tap_avx2(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block) {
+ const uint8_t *src_ptr = src - 2;
+ int32_t y = h;
+ int16_t *im = im_block;
+
+ if (w <= 4) {
+ __m128i coeffs_128[3];
+
+ prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
+ if (w == 2) {
+ do {
+ const __m128i r =
+ x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
+ xy_x_round_store_2x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 2;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i r =
+ x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+ xy_x_round_store_4x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 4;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ __m256i coeffs_256[3], filt_256[3];
+
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+ filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
+
+ prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 8) {
+ do {
+ const __m256i res =
+ x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
+ xy_x_round_store_8x2_avx2(res, im);
+
+ src_ptr += 2 * src_stride;
+ im += 2 * 8;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
+ xy_x_round_store_32_avx2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 16;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ src_ptr += src_stride;
+ im += 32;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ src_ptr += src_stride;
+ im += 64;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
+ xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
+ src_ptr += src_stride;
+ im += 128;
+ } while (--y);
+ }
+ }
+}
+
+static void convolve_2d_sr_hor_8tap_avx2(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block) {
+ const uint8_t *src_ptr = src - 3;
+ int32_t y = h;
+ int16_t *im = im_block;
+ __m256i coeffs_256[4], filt_256[4];
+
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+ filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
+ filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
+
+ prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 8) {
+ do {
+ const __m256i res =
+ x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
+ xy_x_round_store_8x2_avx2(res, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 8;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
+ xy_x_round_store_32_avx2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 16;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ src_ptr += src_stride;
+ im += 32;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ src_ptr += src_stride;
+ im += 64;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ xy_x_8tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
+ xy_x_8tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
+ src_ptr += src_stride;
+ im += 128;
+ } while (--y);
+ }
+}
+
+static void convolve_2d_sr_ver_2tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y = h;
+
+ if (w <= 4) {
+ __m128i coeffs_128;
+
+ prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);
+
+ if (w == 2) {
+ __m128i s_32[2];
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
+
+ do {
+ const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_64[2], r[2];
+
+ assert(w == 4);
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)im);
+
+ do {
+ xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
+ r[0] = xy_y_round_sse2(r[0]);
+ r[1] = xy_y_round_sse2(r[1]);
+ const __m128i rr = _mm_packs_epi32(r[0], r[1]);
+ pack_store_4x2_sse2(rr, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ __m256i coeffs_256;
+
+ prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);
+
+ if (w == 8) {
+ __m128i s_128[2];
+ __m256i r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)im);
+
+ do {
+ xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m256i s_256[2], r[4];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)im);
+
+ do {
+ xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+
+ do {
+ xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[0], s_256[1], &coeffs_256,
+ dst);
+ im += 2 * 32;
+ xy_y_convolve_2tap_32_all_avx2(im, s_256[1], s_256[0], &coeffs_256,
+ dst + dst_stride);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ __m256i s_256[2][4];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
+
+ do {
+ xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[0] + 0, s_256[1] + 0,
+ &coeffs_256, dst);
+ xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[0] + 2, s_256[1] + 2,
+ &coeffs_256, dst + 32);
+ im += 2 * 64;
+ xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ &coeffs_256, dst + dst_stride);
+ xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
+ &coeffs_256, dst + dst_stride + 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[2][8];
+
+ assert(w == 128);
+
+ load_16bit_8rows_avx2(im, 16, s_256[0]);
+
+ do {
+ xy_y_convolve_2tap_32_all_avx2(im + 128, s_256[0] + 0, s_256[1] + 0,
+ &coeffs_256, dst);
+ xy_y_convolve_2tap_32_all_avx2(im + 160, s_256[0] + 2, s_256[1] + 2,
+ &coeffs_256, dst + 1 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 192, s_256[0] + 4, s_256[1] + 4,
+ &coeffs_256, dst + 2 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 224, s_256[0] + 6, s_256[1] + 6,
+ &coeffs_256, dst + 3 * 32);
+ im += 2 * 128;
+ xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ &coeffs_256, dst + dst_stride);
+ xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
+ &coeffs_256, dst + dst_stride + 1 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[1] + 4, s_256[0] + 4,
+ &coeffs_256, dst + dst_stride + 2 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[1] + 6, s_256[0] + 6,
+ &coeffs_256, dst + dst_stride + 3 * 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ }
+}
+
+static void convolve_2d_sr_ver_2tap_half_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y = h;
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ if (w == 2) {
+ __m128i s_32[2];
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
+
+ do {
+ const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
+ const __m128i r = xy_y_round_half_pel_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ __m128i s_64[2];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)im);
+
+ do {
+ const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
+ const __m128i r = xy_y_round_half_pel_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m128i s_128[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)im);
+
+ do {
+ const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
+ const __m256i r = xy_y_round_half_pel_avx2(res);
+ pack_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m256i s_256[2], r[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)im);
+
+ do {
+ xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
+ r[0] = xy_y_round_half_pel_avx2(r[0]);
+ r[1] = xy_y_round_half_pel_avx2(r[1]);
+ xy_y_pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+
+ do {
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32, s_256[0], s_256[1], dst);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32, s_256[1], s_256[0],
+ dst + dst_stride);
+ im += 2 * 32;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ __m256i s_256[2][4];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
+
+ do {
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64, s_256[0] + 0,
+ s_256[1] + 0, dst);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96, s_256[0] + 2,
+ s_256[1] + 2, dst + 32);
+ im += 2 * 64;
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ dst + dst_stride);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[2][8];
+
+ assert(w == 128);
+
+ load_16bit_8rows_avx2(im, 16, s_256[0]);
+
+ do {
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128, s_256[0] + 0,
+ s_256[1] + 0, dst);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160, s_256[0] + 2,
+ s_256[1] + 2, dst + 1 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192, s_256[0] + 4,
+ s_256[1] + 4, dst + 2 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224, s_256[0] + 6,
+ s_256[1] + 6, dst + 3 * 32);
+ im += 2 * 128;
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ dst + dst_stride);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+}
+
+static void convolve_2d_sr_ver_4tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y = h;
+
+ if (w == 2) {
+ __m128i coeffs_128[2], s_32[4], ss_128[2];
+
+ prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
+ s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+
+ ss_128[0] = _mm_unpacklo_epi16(src01, src12);
+
+ do {
+ const __m128i res =
+ xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i coeffs_256[2];
+
+ prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 4) {
+ __m128i s_64[4];
+ __m256i s_256[2], ss_256[2];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
+ s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+
+ do {
+ const __m256i res =
+ xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
+ xy_y_round_store_4x2_avx2(res, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m256i s_256[4], r[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[4];
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+
+ do {
+ xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m256i s_256[5];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[4], tt_256[4], r[4];
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+
+ tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+
+ do {
+ xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
+ r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i r[4];
+
+ do {
+ xy_y_convolve_4tap_16x2_half_pelavx2(im, s_256, coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+      /* This is a special case for OBMC. According to the AV1 spec, 4-tap
+         filters are not used for widths (w) greater than 16, but OBMC halves
+         the block height when predicting from the above block; e.g. a 32x8
+         block becomes 32x4 and reaches this path. */
+ int32_t x = 0;
+
+ assert(!(w % 32));
+
+ __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
+ do {
+ const int16_t *s = im + x;
+ uint8_t *d = dst + x;
+
+ loadu_unpack_16bit_3rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
+ loadu_unpack_16bit_3rows_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1]);
+
+ y = h;
+ do {
+ xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
+ coeffs_256, r0);
+ xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1], coeffs_256, r1);
+
+ xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
+ xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
+
+ s += 2 * w;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+}
+
+static void convolve_2d_sr_ver_6tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y;
+
+ if (w == 2) {
+ __m128i coeffs_128[3], s_32[6], ss_128[3];
+
+ prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
+ s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
+ s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
+ s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+
+ ss_128[0] = _mm_unpacklo_epi16(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi16(src23, src34);
+
+ y = h;
+ do {
+ const __m128i res =
+ xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i coeffs_256[3];
+
+ prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 4) {
+ __m128i s_64[6];
+ __m256i s_256[6], ss_256[3];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
+ s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
+ s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
+ s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+
+ y = h;
+ do {
+ const __m256i res =
+ xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
+ xy_y_round_store_4x2_avx2(res, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m256i s_256[6], r[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[6];
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+
+ do {
+ xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m256i s_256[6];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[6], tt_256[6], r[4];
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+
+ tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
+ tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+ tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
+
+ do {
+ xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256,
+ coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i ss_256[4], r[4];
+
+ do {
+ xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16, s_256, ss_256,
+ coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ int32_t x = 0;
+
+ assert(!(w % 32));
+
+ __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
+
+ do {
+ const int16_t *s = im + x;
+ uint8_t *d = dst + x;
+
+ loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
+ loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1]);
+
+ y = h;
+ do {
+ xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
+ coeffs_256, r0);
+ xy_y_convolve_6tap_16x2_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1], coeffs_256, r1);
+
+ xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
+ xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
+
+ s += 2 * w;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+}
+
+static void convolve_2d_sr_ver_8tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y;
+
+ if (w == 2) {
+ __m128i coeffs_128[4], s_32[8], ss_128[4];
+
+ prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
+ s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
+ s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
+ s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
+ s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
+ s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+ const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+ const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
+
+ ss_128[0] = _mm_unpacklo_epi16(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi16(src23, src34);
+ ss_128[2] = _mm_unpacklo_epi16(src45, src56);
+
+ y = h;
+ do {
+ const __m128i res =
+ xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i coeffs_256[4];
+
+ prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 4) {
+ __m128i s_64[8];
+ __m256i s_256[8], ss_256[4];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
+ s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
+ s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
+ s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
+ s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
+ s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
+ s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
+ s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
+
+ y = h;
+ do {
+ const __m256i res =
+ xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
+ xy_y_round_store_4x2_avx2(res, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m256i s_256[8], r[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
+ s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[8];
+
+ convolve_8tap_unpack_avx2(s_256, ss_256);
+
+ do {
+ xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m256i s_256[8], r[4];
+
+ load_16bit_7rows_avx2(im, 16, s_256);
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[8], tt_256[8];
+
+ convolve_8tap_unpack_avx2(s_256, ss_256);
+ convolve_8tap_unpack_avx2(s_256 + 1, tt_256);
+
+ do {
+ xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16, coeffs_256, s_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ int32_t x = 0;
+ __m256i s_256[2][8], r0[4], r1[4];
+
+ assert(!(w % 32));
+
+ __m256i ss_256[2][8], tt_256[2][8];
+
+ do {
+ const int16_t *s = im + x;
+ uint8_t *d = dst + x;
+
+ load_16bit_7rows_avx2(s, w, s_256[0]);
+ convolve_8tap_unpack_avx2(s_256[0], ss_256[0]);
+ convolve_8tap_unpack_avx2(s_256[0] + 1, tt_256[0]);
+
+ load_16bit_7rows_avx2(s + 16, w, s_256[1]);
+ convolve_8tap_unpack_avx2(s_256[1], ss_256[1]);
+ convolve_8tap_unpack_avx2(s_256[1] + 1, tt_256[1]);
+
+ y = h;
+ do {
+ xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0], ss_256[0],
+ tt_256[0], r0);
+ xy_y_convolve_8tap_16x2_avx2(s + 16, w, coeffs_256, s_256[1],
+ ss_256[1], tt_256[1], r1);
+ xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
+ xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
+
+ s += 2 * w;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+}
+
+typedef void (*Convolve2dSrHorTapFunc)(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block);
+
+typedef void (*Convolve2dSrVerTapFunc)(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride);
+
+static AOM_FORCE_INLINE void av1_convolve_2d_sr_specialized_avx2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
+ const int32_t subpel_y_q4, ConvolveParams *conv_params) {
+ static const Convolve2dSrHorTapFunc
+ convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
+ NULL,
+ NULL,
+ convolve_2d_sr_hor_2tap_avx2,
+ NULL,
+ convolve_2d_sr_hor_4tap_ssse3,
+ NULL,
+ convolve_2d_sr_hor_6tap_avx2,
+ NULL,
+ convolve_2d_sr_hor_8tap_avx2
+ };
+ static const Convolve2dSrVerTapFunc
+ convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
+ NULL,
+ convolve_2d_sr_ver_2tap_half_avx2,
+ convolve_2d_sr_ver_2tap_avx2,
+ convolve_2d_sr_ver_4tap_avx2,
+ convolve_2d_sr_ver_4tap_avx2,
+ convolve_2d_sr_ver_6tap_avx2,
+ convolve_2d_sr_ver_6tap_avx2,
+ convolve_2d_sr_ver_8tap_avx2,
+ convolve_2d_sr_ver_8tap_avx2
+ };
+ const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
+ const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
+
+ assert(tap_x != 12 && tap_y != 12);
+
+ const uint8_t *src_ptr = src - ((tap_y >> 1) - 1) * src_stride;
+ // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
+ // permutation.
+ DECLARE_ALIGNED(32, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+
+ (void)conv_params;
+
+ assert(conv_params->round_0 == 3);
+ assert(conv_params->round_1 == 11);
+
+ // horizontal filter
+ int32_t hh = h + tap_y;
+ assert(!(hh % 2));
+
+ convolve_2d_sr_hor_tap_func_table[tap_x](
+ src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
+
+ // vertical filter
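+  // Half-pel case: subtracting (subpel_y_q4 == 8) moves the 2-tap lookup to
+  // the dedicated averaging kernel convolve_2d_sr_ver_2tap_half_avx2; for 4,
+  // 6 and 8 taps the adjacent table entries point to the same function, which
+  // checks subpel_y_q4 internally.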
+ convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
+ im_block, w, h, filter_params_y, subpel_y_q4, dst, dst_stride);
+}
+
+#endif // THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
diff --git a/third_party/SVT-AV1/convolve_avx2.h b/third_party/SVT-AV1/convolve_avx2.h
new file mode 100644
index 000000000..923cabee7
--- /dev/null
+++ b/third_party/SVT-AV1/convolve_avx2.h
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
+#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
+
+#include "EbMemory_AVX2.h"
+#include "EbMemory_SSE4_1.h"
+#include "synonyms.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/mem_sse2.h"
+
+static INLINE void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
+ __m256i coeffs[2]) {
+ const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
+
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
+}
+
+static INLINE void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
+ __m256i coeffs[3]) {
+ const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
+}
+
+static INLINE void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
+ __m256i coeffs[4]) {
+ const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
+}
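+
+// Note on the shuffle controls above: each 16-bit control value names the two
+// source bytes replicated into every 16-bit lane. For example,
+// _mm256_set1_epi16(0x0604u) selects bytes 4 and 6, i.e. the low bytes of the
+// 16-bit coefficients 2 and 3; the coefficients passed here are already
+// halved by the callers below and fit in a signed byte, so only their low
+// bytes are needed.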
+
+static INLINE void prepare_half_coeffs_2tap_ssse3(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [1] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+
+ const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+ // coeffs 3 4 3 4 3 4 3 4
+ *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
+}
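+
+// The identity behind the halving above: for an even coefficient c,
+// x * c >> k == x * (c >> 1) >> (k - 1), so multiplying by the halved
+// coefficients and shifting one bit less in the final rounding gives the same
+// result with one bit less of intermediate precision.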
+
+static INLINE void prepare_half_coeffs_4tap_ssse3(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [2] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+
+ const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
+}
+
+static INLINE void prepare_half_coeffs_6tap_ssse3(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [3] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+
+ const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
+}
+
+static INLINE void prepare_half_coeffs_8tap_ssse3(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+
+ const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE void prepare_half_coeffs_2tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [1] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
+ const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+
+ const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+ // coeffs 3 4 3 4 3 4 3 4
+ *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+}
+
+static INLINE void prepare_half_coeffs_4tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [2] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+ const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+ populate_coeffs_4tap_avx2(coeffs_1, coeffs);
+}
+
+static INLINE void prepare_half_coeffs_6tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [3] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+ const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+ populate_coeffs_6tap_avx2(coeffs_1, coeffs);
+}
+
+static INLINE void prepare_half_coeffs_8tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+ const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+ populate_coeffs_8tap_avx2(coeffs_1, coeffs);
+}
+
+static INLINE void prepare_coeffs_2tap_sse2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [1] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
+
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+}
+
+static INLINE void prepare_coeffs_4tap_sse2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [2] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
+}
+
+static INLINE void prepare_coeffs_6tap_ssse3(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [3] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
+}
+
+static INLINE void prepare_coeffs_8tap_sse2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE void prepare_coeffs_2tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [1] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+}
+
+static INLINE void prepare_coeffs_4tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [2] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
+static INLINE void prepare_coeffs_6tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [3] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
+}
+
+static INLINE void prepare_coeffs_8tap_avx2(
+ const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE void load_16bit_5rows_avx2(const int16_t *const src,
+ const ptrdiff_t stride,
+ __m256i dst[5]) {
+ dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
+ dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
+ dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
+ dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
+ dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
+}
+
+static INLINE void load_16bit_7rows_avx2(const int16_t *const src,
+ const ptrdiff_t stride,
+ __m256i dst[7]) {
+ dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
+ dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
+ dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
+ dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
+ dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
+ dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
+ dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
+}
+
+static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
+ const ptrdiff_t stride,
+ __m256i dst[8]) {
+ dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
+ dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
+ dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
+ dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
+ dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
+ dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
+ dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
+ dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
+}
+
+static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
+ const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
+ __m256i ss_256[5], __m256i tt_256[5]) {
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+
+ tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
+ tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+ tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
+}
+
+static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
+ const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
+ __m256i ss_256[3], __m256i tt_256[3]) {
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+
+ tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+}
+
+static INLINE void convolve_8tap_unpack_avx2(const __m256i s[6],
+ __m256i ss[7]) {
+ ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
+ ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
+ ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
+ ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
+ ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
+ ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
+}
+
+static INLINE __m128i convolve_2tap_ssse3(const __m128i ss[1],
+ const __m128i coeffs[1]) {
+ return _mm_maddubs_epi16(ss[0], coeffs[0]);
+}
+
+static INLINE __m128i convolve_4tap_ssse3(const __m128i ss[2],
+ const __m128i coeffs[2]) {
+ const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
+ const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
+ return _mm_add_epi16(res_23, res_45);
+}
+
+static INLINE __m128i convolve_6tap_ssse3(const __m128i ss[3],
+ const __m128i coeffs[3]) {
+ const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
+ const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
+ const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
+ const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
+ return _mm_add_epi16(res_1256, res_34);
+}
+
+static INLINE __m128i convolve_8tap_ssse3(const __m128i ss[4],
+ const __m128i coeffs[4]) {
+ const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
+ const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
+ const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
+ const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
+ const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
+ const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
+ return _mm_add_epi16(res_0145, res_2367);
+}
+
+static INLINE __m256i convolve_2tap_avx2(const __m256i ss[1],
+ const __m256i coeffs[1]) {
+ return _mm256_maddubs_epi16(ss[0], coeffs[0]);
+}
+
+static INLINE __m256i convolve_4tap_avx2(const __m256i ss[2],
+ const __m256i coeffs[2]) {
+ const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
+ const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
+ return _mm256_add_epi16(res_23, res_45);
+}
+
+static INLINE __m256i convolve_6tap_avx2(const __m256i ss[3],
+ const __m256i coeffs[3]) {
+ const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
+ const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
+ return _mm256_add_epi16(res_0145, res_23);
+}
+
+static INLINE __m256i convolve_8tap_avx2(const __m256i ss[4],
+ const __m256i coeffs[4]) {
+ const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
+ const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
+ const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
+ const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
+ return _mm256_add_epi16(res_0145, res_2367);
+}
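+
+ // The convolve_{2,4,6,8}tap_{ssse3,avx2}() helpers above expect each ss[i]
+ // to hold interleaved 8-bit pixel pairs (tap j, tap j + 1) and each
+ // coeffs[i] to hold the matching pair of halved coefficients, so a single
+ // maddubs computes p[j] * c[j] + p[j + 1] * c[j + 1] per 16-bit lane; the
+ // partial sums are then combined with 16-bit adds.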
+
+static INLINE __m128i convolve16_2tap_sse2(const __m128i ss[1],
+ const __m128i coeffs[1]) {
+ return _mm_madd_epi16(ss[0], coeffs[0]);
+}
+
+static INLINE __m128i convolve16_4tap_sse2(const __m128i ss[2],
+ const __m128i coeffs[2]) {
+ const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
+ const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
+ return _mm_add_epi32(res_01, res_23);
+}
+
+static INLINE __m128i convolve16_6tap_sse2(const __m128i ss[3],
+ const __m128i coeffs[3]) {
+ const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
+ const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
+ const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
+ const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
+ return _mm_add_epi32(res_0123, res_45);
+}
+
+static INLINE __m128i convolve16_8tap_sse2(const __m128i ss[4],
+ const __m128i coeffs[4]) {
+ const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
+ const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
+ const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
+ const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
+ const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
+ const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
+ return _mm_add_epi32(res_0123, res_4567);
+}
+
+static INLINE __m256i convolve16_2tap_avx2(const __m256i ss[1],
+ const __m256i coeffs[1]) {
+ return _mm256_madd_epi16(ss[0], coeffs[0]);
+}
+
+static INLINE __m256i convolve16_4tap_avx2(const __m256i ss[2],
+ const __m256i coeffs[2]) {
+ const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
+ const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
+ return _mm256_add_epi32(res_1, res_2);
+}
+
+static INLINE __m256i convolve16_6tap_avx2(const __m256i ss[3],
+ const __m256i coeffs[3]) {
+ const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
+ const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
+ const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
+ const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
+ return _mm256_add_epi32(res_0123, res_45);
+}
+
+static INLINE __m256i convolve16_8tap_avx2(const __m256i ss[4],
+ const __m256i coeffs[4]) {
+ const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
+ const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
+ const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
+ const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
+ const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
+ const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
+ return _mm256_add_epi32(res_0123, res_4567);
+}
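+
+ // The convolve16_*tap helpers are the 16-bit counterparts used on the
+ // vertical pass of the 2-D filter: ss[i] holds interleaved 16-bit
+ // intermediates and coeffs[i] the corresponding full-precision coefficient
+ // pair, so madd_epi16 yields 32-bit partial sums that are combined with
+ // 32-bit adds before rounding.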
+
+static INLINE __m256i x_convolve_4tap_avx2(const __m256i data,
+ const __m256i coeffs[2],
+ const __m256i filt[2]) {
+ __m256i ss[2];
+
+ ss[0] = _mm256_shuffle_epi8(data, filt[0]);
+ ss[1] = _mm256_shuffle_epi8(data, filt[1]);
+
+ return convolve_4tap_avx2(ss, coeffs);
+}
+
+static INLINE __m256i x_convolve_6tap_avx2(const __m256i data,
+ const __m256i coeffs[3],
+ const __m256i filt[3]) {
+ __m256i ss[3];
+
+ ss[0] = _mm256_shuffle_epi8(data, filt[0]);
+ ss[1] = _mm256_shuffle_epi8(data, filt[1]);
+ ss[2] = _mm256_shuffle_epi8(data, filt[2]);
+
+ return convolve_6tap_avx2(ss, coeffs);
+}
+
+static INLINE __m256i x_convolve_8tap_avx2(const __m256i data,
+ const __m256i coeffs[4],
+ const __m256i filt[4]) {
+ __m256i ss[4];
+
+ ss[0] = _mm256_shuffle_epi8(data, filt[0]);
+ ss[1] = _mm256_shuffle_epi8(data, filt[1]);
+ ss[2] = _mm256_shuffle_epi8(data, filt[2]);
+ ss[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+ return convolve_8tap_avx2(ss, coeffs);
+}
+
+static INLINE __m256i sr_y_round_avx2(const __m256i src) {
+ const __m256i round = _mm256_set1_epi16(32);
+ const __m256i dst = _mm256_add_epi16(src, round);
+ return _mm256_srai_epi16(dst, FILTER_BITS - 1);
+}
+
+static INLINE __m128i xy_x_round_sse2(const __m128i src) {
+ const __m128i round = _mm_set1_epi16(2);
+ const __m128i dst = _mm_add_epi16(src, round);
+ return _mm_srai_epi16(dst, 2);
+}
+
+static INLINE __m256i xy_x_round_avx2(const __m256i src) {
+ const __m256i round = _mm256_set1_epi16(2);
+ const __m256i dst = _mm256_add_epi16(src, round);
+ return _mm256_srai_epi16(dst, 2);
+}
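+
+ // xy_x_round_*() performs the first-stage (horizontal) rounding of the 2-D
+ // path. Assuming the usual 8-bit configuration (FILTER_BITS == 7,
+ // ROUND0_BITS == 3): since the horizontal pass uses halved coefficients,
+ // the reference's ROUND0_BITS shift becomes a shift by 2 with a rounding
+ // offset of 2, leaving the intermediate at the same scale as the C code.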
+
+static INLINE void xy_x_round_store_2x2_sse2(const __m128i res,
+ int16_t *const dst) {
+ const __m128i d = xy_x_round_sse2(res);
+ _mm_storel_epi64((__m128i *)dst, d);
+}
+
+static INLINE void xy_x_round_store_4x2_sse2(const __m128i res,
+ int16_t *const dst) {
+ const __m128i d = xy_x_round_sse2(res);
+ _mm_storeu_si128((__m128i *)dst, d);
+}
+
+static INLINE void xy_x_round_store_8x2_sse2(const __m128i res[2],
+ int16_t *const dst) {
+ __m128i r[2];
+
+ r[0] = xy_x_round_sse2(res[0]);
+ r[1] = xy_x_round_sse2(res[1]);
+ _mm_storeu_si128((__m128i *)dst, r[0]);
+ _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
+}
+
+static INLINE void xy_x_round_store_8x2_avx2(const __m256i res,
+ int16_t *const dst) {
+ const __m256i d = xy_x_round_avx2(res);
+ _mm256_storeu_si256((__m256i *)dst, d);
+}
+
+static INLINE void xy_x_round_store_32_avx2(const __m256i res[2],
+ int16_t *const dst) {
+ __m256i r[2];
+
+ r[0] = xy_x_round_avx2(res[0]);
+ r[1] = xy_x_round_avx2(res[1]);
+ const __m256i d0 =
+ _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
+ const __m256i d1 =
+ _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
+ _mm256_storeu_si256((__m256i *)dst, d0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), d1);
+}
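+
+ // The insert/extract pair above reassembles the lane-split convolution
+ // results (res[0] covers columns 0-7 and 16-23, res[1] covers columns 8-15
+ // and 24-31) so that this helper stores the 32-wide intermediate row in
+ // natural 0..31 order.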
+
+static INLINE __m128i xy_y_round_sse2(const __m128i src) {
+ const __m128i round = _mm_set1_epi32(1024);
+ const __m128i dst = _mm_add_epi32(src, round);
+ return _mm_srai_epi32(dst, 11);
+}
+
+static INLINE __m128i xy_y_round_half_pel_sse2(const __m128i src) {
+ const __m128i round = _mm_set1_epi16(16);
+ const __m128i dst = _mm_add_epi16(src, round);
+ return _mm_srai_epi16(dst, 5);
+}
+
+static INLINE __m256i xy_y_round_avx2(const __m256i src) {
+ const __m256i round = _mm256_set1_epi32(1024);
+ const __m256i dst = _mm256_add_epi32(src, round);
+ return _mm256_srai_epi32(dst, 11);
+}
+
+static INLINE __m256i xy_y_round_16_avx2(const __m256i r[2]) {
+ const __m256i r0 = xy_y_round_avx2(r[0]);
+ const __m256i r1 = xy_y_round_avx2(r[1]);
+ return _mm256_packs_epi32(r0, r1);
+}
+
+static INLINE __m256i xy_y_round_half_pel_avx2(const __m256i src) {
+ const __m256i round = _mm256_set1_epi16(16);
+ const __m256i dst = _mm256_add_epi16(src, round);
+ return _mm256_srai_epi16(dst, 5);
+}
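+
+ // xy_y_round_*() performs the second-stage (vertical) rounding of the 2-D
+ // path. Assuming FILTER_BITS == 7 and ROUND0_BITS == 3, the 32-bit vertical
+ // sums carry 2 * FILTER_BITS - ROUND0_BITS = 11 fractional bits, hence the
+ // shift by 11 with an offset of 1 << 10. The half_pel variants cover the
+ // bilinear 1/2-pel case, where the vertical pass is a plain add of two rows
+ // (one extra bit of scale), hence the shift by 5 with an offset of 16.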
+
+static INLINE void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m128i d = _mm_packus_epi16(res, res);
+ *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
+ *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
+}
+
+static INLINE void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m128i d = _mm_packus_epi16(res, res);
+ store_u8_4x2_sse2(d, dst, stride);
+}
+
+static INLINE void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i d = _mm256_packus_epi16(res, res);
+ const __m128i d0 = _mm256_castsi256_si128(d);
+ const __m128i d1 = _mm256_extracti128_si256(d, 1);
+
+ xx_storel_32(dst, d0);
+ xx_storel_32(dst + stride, d1);
+}
+
+static INLINE void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i d = _mm256_packus_epi16(res, res);
+ const __m128i d0 = _mm256_castsi256_si128(d);
+ const __m128i d1 = _mm256_extracti128_si256(d, 1);
+ _mm_storel_epi64((__m128i *)dst, d0);
+ _mm_storel_epi64((__m128i *)(dst + stride), d1);
+}
+
+static INLINE void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i d = _mm256_packus_epi16(res0, res1);
+ storeu_u8_16x2_avx2(d, dst, stride);
+}
+
+static INLINE void xy_y_pack_store_16x2_avx2(const __m256i res0,
+ const __m256i res1,
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i t = _mm256_packus_epi16(res0, res1);
+ const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
+ storeu_u8_16x2_avx2(d, dst, stride);
+}
+
+static INLINE void pack_store_32_avx2(const __m256i res0, const __m256i res1,
+ uint8_t *const dst) {
+ const __m256i t = _mm256_packus_epi16(res0, res1);
+ const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
+ _mm256_storeu_si256((__m256i *)dst, d);
+}
+
+static INLINE void xy_y_round_store_2x2_sse2(const __m128i res,
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m128i r = xy_y_round_sse2(res);
+ const __m128i rr = _mm_packs_epi32(r, r);
+ pack_store_2x2_sse2(rr, dst, stride);
+}
+
+static INLINE void xy_y_round_store_4x2_avx2(const __m256i res,
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i r = xy_y_round_avx2(res);
+ const __m256i rr = _mm256_packs_epi32(r, r);
+ pack_store_4x2_avx2(rr, dst, stride);
+}
+
+static INLINE void xy_y_pack_store_32_avx2(const __m256i res0,
+ const __m256i res1,
+ uint8_t *const dst) {
+ const __m256i d = _mm256_packus_epi16(res0, res1);
+ // The cross-lane permute is intentionally skipped here: for this path the
+ // 32-wide intermediate rows are stored in maddubs lane order (columns
+ // 0-7/16-23 followed by 8-15/24-31), so packus already leaves the bytes in
+ // natural order.
+ // d = _mm256_permute4x64_epi64(d, 0xD8);
+ _mm256_storeu_si256((__m256i *)dst, d);
+}
+
+static INLINE void xy_y_round_store_32_avx2(const __m256i r0[2],
+ const __m256i r1[2],
+ uint8_t *const dst) {
+ const __m256i ra = xy_y_round_16_avx2(r0);
+ const __m256i rb = xy_y_round_16_avx2(r1);
+ xy_y_pack_store_32_avx2(ra, rb, dst);
+}
+
+static INLINE void convolve_store_32_avx2(const __m256i res0,
+ const __m256i res1,
+ uint8_t *const dst) {
+ const __m256i d = _mm256_packus_epi16(res0, res1);
+ _mm256_storeu_si256((__m256i *)dst, d);
+}
+
+static INLINE __m128i sr_x_round_sse2(const __m128i src) {
+ const __m128i round = _mm_set1_epi16(34);
+ const __m128i dst = _mm_add_epi16(src, round);
+ return _mm_srai_epi16(dst, 6);
+}
+
+static INLINE __m256i sr_x_round_avx2(const __m256i src) {
+ const __m256i round = _mm256_set1_epi16(34);
+ const __m256i dst = _mm256_add_epi16(src, round);
+ return _mm256_srai_epi16(dst, 6);
+}
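+
+ // sr_x_round_*() folds the reference's two-stage horizontal-only rounding
+ // (shift by ROUND0_BITS, then by FILTER_BITS - ROUND0_BITS) into one step.
+ // Assuming ROUND0_BITS == 3, the combined offset on a full-scale sum is
+ // 4 + (8 << 3) = 68; with halved coefficients this becomes 34 with a shift
+ // of 6. sr_y_round_*() is the single-stage vertical-only case: offset 32,
+ // shift FILTER_BITS - 1.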
+
+static INLINE __m128i sr_y_round_sse2(const __m128i src) {
+ const __m128i round = _mm_set1_epi16(32);
+ const __m128i dst = _mm_add_epi16(src, round);
+ return _mm_srai_epi16(dst, FILTER_BITS - 1);
+}
+
+static INLINE void sr_x_round_store_8x2_avx2(const __m256i res,
+ uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i r = sr_x_round_avx2(res);
+ pack_store_8x2_avx2(r, dst, dst_stride);
+}
+
+static INLINE void sr_x_round_store_16x2_avx2(const __m256i res[2],
+ uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m256i r[2];
+
+ r[0] = sr_x_round_avx2(res[0]);
+ r[1] = sr_x_round_avx2(res[1]);
+ pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
+}
+
+static INLINE void sr_x_round_store_32_avx2(const __m256i res[2],
+ uint8_t *const dst) {
+ __m256i r[2];
+
+ r[0] = sr_x_round_avx2(res[0]);
+ r[1] = sr_x_round_avx2(res[1]);
+ convolve_store_32_avx2(r[0], r[1], dst);
+}
+
+static INLINE void sr_y_round_store_8x2_avx2(const __m256i res,
+ uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i r = sr_y_round_avx2(res);
+ pack_store_8x2_avx2(r, dst, dst_stride);
+}
+
+static INLINE void sr_y_round_store_16x2_avx2(const __m256i res[2],
+ uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m256i r[2];
+
+ r[0] = sr_y_round_avx2(res[0]);
+ r[1] = sr_y_round_avx2(res[1]);
+ pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
+}
+
+static INLINE void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
+ const __m256i s0, __m256i *const s1,
+ uint8_t *const dst) {
+ *s1 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i d = _mm256_avg_epu8(s0, *s1);
+ _mm256_storeu_si256((__m256i *)dst, d);
+}
+
+static INLINE void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
+ uint8_t *const dst) {
+ const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
+ const __m256i d = _mm256_avg_epu8(s0, s1);
+ _mm256_storeu_si256((__m256i *)dst, d);
+}
+
+static INLINE __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[1]) {
+ const __m128i sfl =
+ _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
+ const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
+ return convolve_2tap_ssse3(&ss, coeffs);
+}
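+
+ // The shuffle mask above builds the (p0, p1), (p1, p2) input pairs for a
+ // 2-wide, 2-row block: load_u8_4x2_sse4_1() places row 0 in bytes 0-3 and
+ // row 1 in bytes 4-7, so bytes {0, 1, 1, 2} and {4, 5, 5, 6} are the two
+ // rows' adjacent-pixel pairs consumed by the 2-tap maddubs.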
+
+static INLINE __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[1]) {
+ const __m128i sfl =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
+ const __m128i s_128 = load_u8_8x2_sse2(src, stride);
+ const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
+ return convolve_2tap_ssse3(&ss, coeffs);
+}
+
+static INLINE void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[1],
+ __m128i r[2]) {
+ __m128i ss[2];
+ const __m128i s00 = _mm_loadu_si128((__m128i *)src);
+ const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
+ const __m128i s01 = _mm_srli_si128(s00, 1);
+ const __m128i s11 = _mm_srli_si128(s10, 1);
+ ss[0] = _mm_unpacklo_epi8(s00, s01);
+ ss[1] = _mm_unpacklo_epi8(s10, s11);
+
+ r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
+ r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
+}
+
+static INLINE __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[1]) {
+ __m128i s_128[2][2];
+ __m256i s_256[2];
+
+ s_128[0][0] = _mm_loadu_si128((__m128i *)src);
+ s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
+ s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
+ s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
+ s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
+ s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
+ const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ return convolve_2tap_avx2(&ss, coeffs);
+}
+
+static INLINE void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[1],
+ __m256i r[2]) {
+ const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
+ const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
+ const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
+ const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
+ r[0] = convolve_2tap_avx2(&s0, coeffs);
+ r[1] = convolve_2tap_avx2(&s1, coeffs);
+}
+
+static INLINE void x_convolve_2tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[1],
+ __m256i r[2]) {
+ const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
+ const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
+ const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
+
+ r[0] = convolve_2tap_avx2(&ss0, coeffs);
+ r[1] = convolve_2tap_avx2(&ss1, coeffs);
+}
+
+static INLINE __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[2]) {
+ const __m128i sfl0 =
+ _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i sfl1 =
+ _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i s = load_u8_8x2_sse2(src, stride);
+ __m128i ss[2];
+
+ ss[0] = _mm_shuffle_epi8(s, sfl0);
+ ss[1] = _mm_shuffle_epi8(s, sfl1);
+ return convolve_4tap_ssse3(ss, coeffs);
+}
+
+static INLINE __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[2]) {
+ const __m128i s = load_u8_8x2_sse2(src, stride);
+ const __m128i sfl0 =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
+ const __m128i sfl1 =
+ _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
+ __m128i ss[2];
+
+ ss[0] = _mm_shuffle_epi8(s, sfl0);
+ ss[1] = _mm_shuffle_epi8(s, sfl1);
+ return convolve_4tap_ssse3(ss, coeffs);
+}
+
+static INLINE __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[2],
+ const __m256i filt[2]) {
+ const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
+ return x_convolve_4tap_avx2(s_256, coeffs, filt);
+}
+
+static INLINE void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
+ const int32_t src_stride,
+ const __m256i coeffs[2],
+ const __m256i filt[2],
+ __m256i r[2]) {
+ r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
+ r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
+}
+
+static INLINE void x_convolve_4tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[2],
+ const __m256i filt[2],
+ __m256i r[2]) {
+ const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
+
+ r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
+ r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
+}
+
+static INLINE __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[3]) {
+ const __m128i sfl0 =
+ _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i sfl1 =
+ _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i sfl2 =
+ _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
+
+ const __m128i s = load_u8_8x2_sse2(src, stride);
+ __m128i ss[3];
+
+ ss[0] = _mm_shuffle_epi8(s, sfl0);
+ ss[1] = _mm_shuffle_epi8(s, sfl1);
+ ss[2] = _mm_shuffle_epi8(s, sfl2);
+ return convolve_6tap_ssse3(ss, coeffs);
+}
+
+static INLINE __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[3]) {
+ const __m128i s = load_u8_8x2_sse2(src, stride);
+ const __m128i sfl0 =
+ _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i sfl1 =
+ _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i sfl2 =
+ _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
+ __m128i ss[3];
+
+ ss[0] = _mm_shuffle_epi8(s, sfl0);
+ ss[1] = _mm_shuffle_epi8(s, sfl1);
+ ss[2] = _mm_shuffle_epi8(s, sfl2);
+ return convolve_6tap_ssse3(ss, coeffs);
+}
+
+static INLINE __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[3],
+ const __m256i filt[3]) {
+ const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
+ return x_convolve_6tap_avx2(s_256, coeffs, filt);
+}
+
+static INLINE void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
+ const int32_t src_stride,
+ const __m256i coeffs[3],
+ const __m256i filt[3],
+ __m256i r[2]) {
+ r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
+ r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
+}
+
+static INLINE void x_convolve_6tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[3],
+ const __m256i filt[3],
+ __m256i r[2]) {
+ const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
+
+ r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
+ r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
+}
+
+static INLINE __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[4],
+ const __m256i filt[4]) {
+ const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
+ return x_convolve_8tap_avx2(s_256, coeffs, filt);
+}
+
+static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
+ const int32_t src_stride,
+ const __m256i coeffs[4],
+ const __m256i filt[4],
+ __m256i r[2]) {
+ r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
+ r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
+}
+
+static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[4],
+ const __m256i filt[4],
+ __m256i r[2]) {
+ const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
+
+ r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
+ r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
+}
+
+static INLINE __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[1],
+ __m128i s_16[2]) {
+ __m128i s_128[2];
+
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src + stride));
+ s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
+ s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
+ const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
+ return convolve_2tap_ssse3(&ss, coeffs);
+}
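+
+ // The y_convolve_*() helpers keep a small ring of previously loaded rows in
+ // the caller-owned s_16/s_32/s_64/s_128 and ss_*/tt_* arrays: each call
+ // loads the next two source rows, forms the new row pairs, and overwrites
+ // the oldest slots so the next iteration can reuse the rows it already has.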
+
+static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[1],
+ __m128i s_32[2]) {
+ __m128i s_128[2];
+
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
+ s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
+ s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
+ const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
+ return convolve_2tap_ssse3(&ss, coeffs);
+}
+
+static INLINE __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[1],
+ __m128i s_64[2]) {
+ __m256i s_256[2];
+
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
+ s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
+ s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
+ const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ return convolve_2tap_avx2(&ss, coeffs);
+}
+
+static INLINE void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[1],
+ __m128i s_128[2], __m256i r[2]) {
+ __m256i s_256[2];
+
+ s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
+ s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
+ s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
+ s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
+ const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+ r[0] = convolve_2tap_avx2(&ss0, coeffs);
+ r[1] = convolve_2tap_avx2(&ss1, coeffs);
+}
+
+static INLINE void y_convolve_2tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[1],
+ const __m256i s0, __m256i *const s1,
+ __m256i r[2]) {
+ *s1 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
+ const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
+ r[0] = convolve_2tap_avx2(&ss0, coeffs);
+ r[1] = convolve_2tap_avx2(&ss1, coeffs);
+}
+
+static INLINE __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[2],
+ __m128i s_16[4],
+ __m128i ss_128[2]) {
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
+ const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
+ const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+ return convolve_4tap_ssse3(ss_128, coeffs);
+}
+
+static INLINE __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[2],
+ __m128i s_32[4],
+ __m128i ss_128[2]) {
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+ return convolve_4tap_ssse3(ss_128, coeffs);
+}
+
+static INLINE __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[2],
+ __m128i s_64[4],
+ __m256i ss_256[2]) {
+ s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
+ const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
+ const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+ return convolve_4tap_avx2(ss_256, coeffs);
+}
+
+static INLINE void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[2],
+ __m128i s_128[4],
+ __m256i ss_256[4], __m256i r[2]) {
+ s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
+ const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
+ s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
+ const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+ ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
+ r[0] = convolve_4tap_avx2(ss_256, coeffs);
+ r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
+}
+
+static INLINE __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[3],
+ __m128i s_16[6],
+ __m128i ss_128[3]) {
+ s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
+ const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
+ const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
+ ss_128[2] = _mm_unpacklo_epi8(src45, src56);
+ return convolve_6tap_ssse3(ss_128, coeffs);
+}
+
+static INLINE void y_convolve_4tap_32x2_avx2(
+ const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
+ __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
+ s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
+ ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
+ tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
+ tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
+ r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
+ r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
+ r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
+ r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
+}
+
+static INLINE __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[3],
+ __m128i s_32[6],
+ __m128i ss_128[3]) {
+ s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
+ const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
+ const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
+ ss_128[2] = _mm_unpacklo_epi8(src45, src56);
+ return convolve_6tap_ssse3(ss_128, coeffs);
+}
+
+static INLINE __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[3],
+ __m128i s_64[6],
+ __m256i ss_256[3]) {
+ s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
+ const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
+ s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
+ const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
+ ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
+ return convolve_6tap_avx2(ss_256, coeffs);
+}
+
+static INLINE void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[3],
+ __m128i s_128[6],
+ __m256i ss_256[6], __m256i r[2]) {
+ s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
+ const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
+ s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
+ const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
+ ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
+ ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
+ r[0] = convolve_6tap_avx2(ss_256, coeffs);
+ r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
+}
+
+static INLINE void y_convolve_6tap_32x2_avx2(
+ const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
+ __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
+ s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
+ ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
+ ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
+ s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
+ tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
+ tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
+ r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
+ r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
+ r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
+ r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
+}
+
+static INLINE __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[4],
+ __m128i s_16[8],
+ __m128i ss_128[4]) {
+ s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
+ const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
+ s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
+ const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
+ ss_128[3] = _mm_unpacklo_epi8(src67, src78);
+ return convolve_8tap_ssse3(ss_128, coeffs);
+}
+
+static INLINE __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m128i coeffs[4],
+ __m128i s_32[8],
+ __m128i ss_128[4]) {
+ s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
+ const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
+ s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
+ const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
+ ss_128[3] = _mm_unpacklo_epi8(src67, src78);
+ return convolve_8tap_ssse3(ss_128, coeffs);
+}
+
+static INLINE __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[4],
+ __m128i s_64[8],
+ __m256i ss_256[4]) {
+ s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
+ const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
+ s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
+ const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
+ ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
+ return convolve_8tap_avx2(ss_256, coeffs);
+}
+
+static INLINE void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
+ const ptrdiff_t stride,
+ const __m256i coeffs[4],
+ __m128i s_128[8],
+ __m256i ss_256[8], __m256i r[2]) {
+ s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
+ const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
+ s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
+ const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
+ ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
+ ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
+ r[0] = convolve_8tap_avx2(ss_256, coeffs);
+ r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
+}
+
+static INLINE void y_convolve_8tap_32x2_avx2(
+ const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
+ __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
+ s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
+ ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
+ ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
+ s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
+ tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
+ tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
+ r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
+ r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
+ r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
+ r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
+}
+
+static INLINE void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[1],
+ __m256i r[2]) {
+ const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
+ const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
+ const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
+
+ r[0] = convolve_2tap_avx2(&ss0, coeffs);
+ r[1] = convolve_2tap_avx2(&ss1, coeffs);
+}
+
+static INLINE void xy_x_2tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[1],
+ int16_t *const dst) {
+ __m256i r[2];
+
+ xy_x_convolve_2tap_32_avx2(src, coeffs, r);
+ const __m256i d0 = xy_x_round_avx2(r[0]);
+ const __m256i d1 = xy_x_round_avx2(r[1]);
+ _mm256_storeu_si256((__m256i *)dst, d0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), d1);
+}
+
+static INLINE void xy_x_4tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[2],
+ const __m256i filt[2],
+ int16_t *const dst) {
+ __m256i r[2];
+
+ x_convolve_4tap_32_avx2(src, coeffs, filt, r);
+ const __m256i d0 = xy_x_round_avx2(r[0]);
+ const __m256i d1 = xy_x_round_avx2(r[1]);
+ _mm256_storeu_si256((__m256i *)dst, d0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), d1);
+}
+
+static INLINE void xy_x_6tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[3],
+ const __m256i filt[3],
+ int16_t *const dst) {
+ __m256i r[2];
+
+ x_convolve_6tap_32_avx2(src, coeffs, filt, r);
+ const __m256i d0 = xy_x_round_avx2(r[0]);
+ const __m256i d1 = xy_x_round_avx2(r[1]);
+ _mm256_storeu_si256((__m256i *)dst, d0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), d1);
+}
+
+static INLINE void xy_x_8tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[4],
+ const __m256i filt[4],
+ int16_t *const dst) {
+ __m256i r[2];
+
+ x_convolve_8tap_32_avx2(src, coeffs, filt, r);
+ const __m256i d0 = xy_x_round_avx2(r[0]);
+ const __m256i d1 = xy_x_round_avx2(r[1]);
+ _mm256_storeu_si256((__m256i *)dst, d0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), d1);
+}
+
+static INLINE __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
+ __m128i s_32[2],
+ const __m128i coeffs[1]) {
+ __m128i s_128[2];
+
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
+ s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
+ s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
+ const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
+ return convolve16_2tap_sse2(&ss, coeffs);
+}
+
+static INLINE __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
+ const int16_t *const src, __m128i s_32[2]) {
+ __m128i s_128[2];
+
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
+ s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
+ s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
+ return _mm_add_epi16(s_128[0], s_128[1]);
+}
+
+static INLINE void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
+ __m128i s_64[2],
+ const __m128i coeffs[1],
+ __m128i r[2]) {
+ __m128i s_128[2];
+
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
+ s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
+ s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
+ const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
+ r[0] = convolve16_2tap_sse2(&ss0, coeffs);
+ r[1] = convolve16_2tap_sse2(&ss1, coeffs);
+}
+
+static INLINE __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
+ const int16_t *const src, __m128i s_64[2]) {
+ __m128i s_128[2];
+
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
+ s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
+ s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
+ return _mm_add_epi16(s_128[0], s_128[1]);
+}
+
+static INLINE void xy_y_convolve_2tap_16_avx2(const __m256i s0,
+ const __m256i s1,
+ const __m256i coeffs[1],
+ __m256i r[2]) {
+ const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
+ r[0] = convolve16_2tap_avx2(&ss0, coeffs);
+ r[1] = convolve16_2tap_avx2(&ss1, coeffs);
+}
+
+static INLINE void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
+ __m128i s_128[2],
+ const __m256i coeffs[1],
+ __m256i r[2]) {
+ __m256i s_256[2];
+ s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
+ s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
+ s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
+ xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
+}
+
+static INLINE __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
+ const int16_t *const src, __m128i s_128[2]) {
+ __m256i s_256[2];
+ s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
+ s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
+ s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
+ return _mm256_add_epi16(s_256[0], s_256[1]);
+}
+
+static INLINE void xy_y_convolve_2tap_16x2_half_pel_avx2(
+ const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
+ r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
+}
+
+static INLINE void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i t = _mm256_packus_epi16(r[0], r[1]);
+ const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
+ storeu_u8_16x2_avx2(d, dst, stride);
+}
+
+static INLINE void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
+ __m256i s[2],
+ const __m256i coeffs[1],
+ __m256i r[4]) {
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
+ xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
+}
+
+static INLINE void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
+ const __m256i s0[2],
+ __m256i s1[2],
+ const __m256i coeffs[1],
+ __m256i r[4]) {
+ s1[0] = _mm256_loadu_si256((__m256i *)src);
+ s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
+ xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
+ xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
+}
+
+static INLINE void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
+ const __m256i s0[2],
+ __m256i s1[2],
+ const __m256i coeffs[1],
+ uint8_t *const dst) {
+ __m256i r[4];
+
+ xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
+ xy_y_round_store_32_avx2(r + 0, r + 2, dst);
+}
+
+static INLINE void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
+ const __m256i s0[2],
+ __m256i s1[2],
+ __m256i r[2]) {
+ s1[0] = _mm256_loadu_si256((__m256i *)src);
+ s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
+ r[0] = _mm256_add_epi16(s0[0], s1[0]);
+ r[1] = _mm256_add_epi16(s0[1], s1[1]);
+}
+
+static INLINE void xy_y_convolve_2tap_half_pel_32_all_avx2(
+ const int16_t *const src, const __m256i s0[2], __m256i s1[2],
+ uint8_t *const dst) {
+ __m256i r[2];
+
+ xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
+ r[0] = xy_y_round_half_pel_avx2(r[0]);
+ r[1] = xy_y_round_half_pel_avx2(r[1]);
+ xy_y_pack_store_32_avx2(r[0], r[1], dst);
+}
+
+static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
+ __m128i s_32[4],
+ __m128i ss_128[2],
+ const __m128i coeffs[2]) {
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
+ ss_128[1] = _mm_unpacklo_epi16(src23, src34);
+ const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
+ ss_128[0] = ss_128[1];
+ return r;
+}
+
+static INLINE __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
+ __m128i s_64[4],
+ __m256i ss_256[2],
+ const __m256i coeffs[2]) {
+ __m256i s_256[2];
+ s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
+ s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
+ s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
+ ss_256[0] = ss_256[1];
+ return r;
+}
+
+static INLINE void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
+ const __m256i coeffs[2],
+ __m256i r[2]) {
+ r[0] = convolve16_4tap_avx2(ss, coeffs);
+ r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
+}
+
+static INLINE void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
+ __m256i ss_256[4],
+ const __m256i coeffs[2],
+ __m256i r[2]) {
+ __m256i s_256[2];
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+}
+
+static INLINE void xy_y_convolve_4tap_8x2_half_pel_avx2(
+ const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
+ __m256i r[2]) {
+ __m256i a_256[2];
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
+ xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
+ s_256[0] = s_256[2];
+ s_256[1] = s_256[3];
+}
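+
+ // The *_half_pel variants exploit the symmetry of the 1/2-pel kernels
+ // (tap k equals tap N - 1 - k): mirrored rows are added first, so a 4-tap
+ // half-pel filter reduces to a 2-tap multiply and a 6-/8-tap one to a
+ // 4-tap multiply.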
+
+static INLINE void xy_y_convolve_4tap_16x2_avx2(
+ const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
+ __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
+ s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+ tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
+ tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
+ xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+ tt_256[0] = tt_256[1];
+ tt_256[2] = tt_256[3];
+}
+
+static INLINE void xy_y_convolve_4tap_32x2_avx2(
+ const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
+ __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
+ __m256i r[4]) {
+ s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
+ tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
+ tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
+ xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+ tt_256[0] = tt_256[1];
+ tt_256[2] = tt_256[3];
+}
+
+static INLINE void xy_y_convolve_4tap_16x2_half_pelavx2(
+ const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
+ __m256i r[4]) {
+ __m256i a_256[2];
+
+ s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
+ xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
+
+ a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
+ a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
+ xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
+
+ s_256[0] = s_256[2];
+ s_256[1] = s_256[3];
+ s_256[2] = s_256[4];
+}
+
+static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
+ __m128i s_32[6],
+ __m128i ss_128[3],
+ const __m128i coeffs[3]) {
+ s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
+ const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
+ const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
+ ss_128[2] = _mm_unpacklo_epi16(src45, src56);
+ const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ return r;
+}
+
+static INLINE __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
+ __m128i s_64[6],
+ __m256i ss_256[3],
+ const __m256i coeffs[3]) {
+ __m256i s_256[2];
+ s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
+ s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
+ s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
+ s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
+ ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ return r;
+}
+
+static INLINE void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
+ const __m256i coeffs[3],
+ __m256i r[2]) {
+ r[0] = convolve16_6tap_avx2(ss, coeffs);
+ r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
+}
+
+static INLINE void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
+ __m256i ss_256[6],
+ const __m256i coeffs[3],
+ __m256i r[2]) {
+ __m256i s_256[2];
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
+ ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[3] = ss_256[4];
+ ss_256[4] = ss_256[5];
+}
+
+static INLINE void xy_y_convolve_6tap_8x2_half_pel_avx2(
+ const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
+ __m256i r[2]) {
+ __m256i a_256[2], ss_256[4];
+ s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
+ s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
+ s_256[0] = s_256[2];
+ s_256[1] = s_256[3];
+ s_256[2] = s_256[4];
+ s_256[3] = s_256[5];
+}
+
+static INLINE void xy_y_convolve_6tap_16x2_avx2(
+ const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
+ __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
+ __m256i r[4]) {
+ s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
+ ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
+ ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
+ s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
+ tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
+ tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
+
+ xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
+ xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[3] = ss_256[4];
+ ss_256[4] = ss_256[5];
+
+ tt_256[0] = tt_256[1];
+ tt_256[1] = tt_256[2];
+ tt_256[3] = tt_256[4];
+ tt_256[4] = tt_256[5];
+}
+
+static INLINE void xy_y_convolve_6tap_16x2_half_pel_avx2(
+ const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
+ __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
+ __m256i a_256[2];
+
+ s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
+
+ a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
+ s_256[0] = s_256[2];
+ s_256[2] = s_256[4];
+ s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
+ a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
+ s_256[1] = s_256[3];
+ s_256[3] = s_256[5];
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
+}
+
+static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
+ __m128i s_32[8],
+ __m128i ss_128[4],
+ const __m128i coeffs[4]) {
+ s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * 2));
+ const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
+ s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * 2));
+ const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
+ ss_128[3] = _mm_unpacklo_epi16(src67, src78);
+ const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ ss_128[2] = ss_128[3];
+ return r;
+}
+
+static INLINE __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
+ __m128i s_64[8],
+ __m256i ss_256[4],
+ const __m256i coeffs[4]) {
+ __m256i s_256[2];
+ s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
+ s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
+ s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
+ s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
+ ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+ return r;
+}
+
+static INLINE void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
+ const __m256i coeffs[4],
+ __m256i r[2]) {
+ r[0] = convolve16_8tap_avx2(ss, coeffs);
+ r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
+}
+
+static INLINE void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
+ __m256i ss_256[8],
+ const __m256i coeffs[4],
+ __m256i r[2]) {
+ __m256i s_256[2];
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
+ ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+ ss_256[4] = ss_256[5];
+ ss_256[5] = ss_256[6];
+ ss_256[6] = ss_256[7];
+}
+
+static INLINE void xy_y_convolve_8tap_8x2_half_pel_avx2(
+ const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
+ __m256i r[2]) {
+ __m256i a_256[4], ss_256[4];
+
+ s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
+ s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
+ a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
+ a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
+ s_256[0] = s_256[2];
+ s_256[1] = s_256[3];
+ s_256[2] = s_256[4];
+ s_256[3] = s_256[5];
+ s_256[4] = s_256[6];
+ s_256[5] = s_256[7];
+}
+
+static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
+ const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
+ __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
+ s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
+ ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
+ ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
+ s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
+ tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
+ tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
+
+ xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
+ xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+ ss_256[4] = ss_256[5];
+ ss_256[5] = ss_256[6];
+ ss_256[6] = ss_256[7];
+
+ tt_256[0] = tt_256[1];
+ tt_256[1] = tt_256[2];
+ tt_256[2] = tt_256[3];
+ tt_256[4] = tt_256[5];
+ tt_256[5] = tt_256[6];
+ tt_256[6] = tt_256[7];
+}
+
+static INLINE void xy_y_convolve_8tap_16x2_half_pel_avx2(
+ const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
+ __m256i s_256[8], __m256i r[4]) {
+ __m256i a_256[4], ss_256[4];
+ s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
+
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
+ a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
+ a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
+
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
+
+ a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
+ a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
+ a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
+ s_256[0] = s_256[2];
+ s_256[2] = s_256[4];
+ s_256[4] = s_256[6];
+ s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
+
+ a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
+ s_256[1] = s_256[3];
+ s_256[3] = s_256[5];
+ s_256[5] = s_256[7];
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
+
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
+}
+
+static INLINE void xy_y_round_store_8x2_avx2(const __m256i res[2],
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i r = xy_y_round_16_avx2(res);
+ pack_store_8x2_avx2(r, dst, stride);
+}
+
+static INLINE void xy_y_round_store_16x2_avx2(const __m256i res[4],
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ const __m256i r0 = xy_y_round_16_avx2(res + 0);
+ const __m256i r1 = xy_y_round_16_avx2(res + 2);
+ xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
+}
+
+static INLINE void sr_y_round_store_32_avx2(const __m256i res[2],
+ uint8_t *const dst) {
+ __m256i r[2];
+
+ r[0] = sr_y_round_avx2(res[0]);
+ r[1] = sr_y_round_avx2(res[1]);
+ convolve_store_32_avx2(r[0], r[1], dst);
+}
+
+static INLINE void sr_y_round_store_32x2_avx2(const __m256i res[4],
+ uint8_t *const dst,
+ const int32_t dst_stride) {
+ sr_y_round_store_32_avx2(res, dst);
+ sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
+}
+
+static INLINE void sr_y_2tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[1], const __m256i s0,
+ __m256i *const s1, uint8_t *const dst) {
+ __m256i r[2];
+ y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
+ sr_y_round_store_32_avx2(r, dst);
+}
+
+static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
+ const int32_t subpel_y_q4) {
+ int32_t x, y;
+ __m128i coeffs_128[4];
+ __m256i coeffs_256[4];
+
+ int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
+
+ if (vert_tap == 2) {
+ // vert_filt as 2 tap
+ const uint8_t *src_ptr = src;
+
+ y = h;
+
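+ // A sub-pel offset of 8 is the half-pel position; for a 2-tap filter this
+ // reduces to averaging two adjacent rows, handled in the else branch below.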
+ if (subpel_y_q4 != 8) {
+ if (w <= 8) {
+ prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
+ coeffs_128);
+
+ if (w == 2) {
+ __m128i s_16[2];
+
+ s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
+
+ do {
+ const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
+ coeffs_128, s_16);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ __m128i s_32[2];
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
+
+ do {
+ const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
+ coeffs_128, s_32);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_64[2], s_128[2];
+
+ assert(w == 8);
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
+
+ do {
+ // Note: keeping this width in SSE registers is faster than moving it into
+ // AVX2 registers.
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
+ s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
+ const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
+ const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
+ const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
+ const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
+ const __m128i r0 = sr_y_round_sse2(res0);
+ const __m128i r1 = sr_y_round_sse2(res1);
+ const __m128i d = _mm_packus_epi16(r0, r1);
+ _mm_storel_epi64((__m128i *)dst, d);
+ _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 16) {
+ __m128i s_128[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
+
+ do {
+ __m256i r[2];
+
+ y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ __m256i s_256[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
+
+ do {
+ sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
+ &s_256[1], dst);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
+ &s_256[0], dst + dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+
+ do {
+ sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
+ &s_256[1][0], dst);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
+ s_256[0][1], &s_256[1][1], dst + 32);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
+ s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[2][4];
+
+ assert(w == 128);
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
+
+ do {
+ sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
+ &s_256[1][0], dst);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
+ s_256[0][1], &s_256[1][1], dst + 1 * 32);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
+ s_256[0][2], &s_256[1][2], dst + 2 * 32);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
+ s_256[0][3], &s_256[1][3], dst + 3 * 32);
+
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
+ s_256[1][1], &s_256[0][1],
+ dst + dst_stride + 1 * 32);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
+ s_256[1][2], &s_256[0][2],
+ dst + dst_stride + 2 * 32);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
+ s_256[1][3], &s_256[0][3],
+ dst + dst_stride + 3 * 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ }
+ } else {
+ // average to get half pel
+ if (w <= 8) {
+ if (w == 2) {
+ __m128i s_16[2];
+
+ s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
+
+ do {
+ s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
+ *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
+ s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
+ *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ __m128i s_32[2];
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
+
+ do {
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
+ xx_storel_32(dst, d0);
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
+ xx_storel_32(dst + dst_stride, d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_64[2];
+
+ assert(w == 8);
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
+
+ do {
+ // Note: keeping this width in SSE registers is faster than moving it into
+ // AVX2 registers.
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
+ _mm_storel_epi64((__m128i *)dst, d0);
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m128i s_128[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
+
+ do {
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
+ _mm_storeu_si128((__m128i *)dst, d0);
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ __m256i s_256[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
+
+ do {
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
+ dst + dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+
+ do {
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
+ dst);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
+ &s_256[1][1], dst + 32);
+
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
+ &s_256[0][1], dst + dst_stride + 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[2][4];
+
+ assert(w == 128);
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
+
+ do {
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
+ dst);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
+ &s_256[1][1], dst + 1 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
+ &s_256[1][2], dst + 2 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
+ &s_256[1][3], dst + 3 * 32);
+
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
+ &s_256[0][1], dst + dst_stride + 1 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
+ &s_256[0][2], dst + dst_stride + 2 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
+ &s_256[0][3], dst + dst_stride + 3 * 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ }
+ } else if (vert_tap == 4) {
+ // vert_filt as 4 tap
+ const uint8_t *src_ptr = src - src_stride;
+
+ y = h;
+
+ if (w <= 4) {
+ prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ if (w == 2) {
+ __m128i s_16[4], ss_128[2];
+
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+ const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_4tap_2x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_16, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_32[4], ss_128[2];
+
+ assert(w == 4);
+
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_4tap_4x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_32, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 8) {
+ __m128i s_64[4];
+ __m256i ss_256[2];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m256i res = y_convolve_4tap_8x2_avx2(
+ src_ptr, src_stride, coeffs_256, s_64, ss_256);
+ sr_y_round_store_8x2_avx2(res, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m128i s_128[4];
+ __m256i ss_256[4], r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
+ s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ ss_256, r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ // The AV1 standard has no 32x4 case; this path only serves an optimization
+ // feature that subsamples 32x8 to 32x4 and thereby triggers the 4-tap
+ // filter.
+
+ __m256i s_256[4], ss_256[4], tt_256[4], r[4];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+
+ do {
+ src_ptr += 2 * src_stride;
+ y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
+ ss_256, tt_256, r);
+ sr_y_round_store_32x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+
+ tt_256[0] = tt_256[1];
+ tt_256[2] = tt_256[3];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ assert(!(w % 32));
+
+ __m256i s_256[4], ss_256[4], tt_256[4], r[4];
+ x = 0;
+ do {
+ const uint8_t *s = src_ptr + x;
+ uint8_t *d = dst + x;
+ s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+
+ y = h;
+ do {
+ s += 2 * src_stride;
+ y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ sr_y_round_store_32x2_avx2(r, d, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+
+ tt_256[0] = tt_256[1];
+ tt_256[2] = tt_256[3];
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ x += 32;
+ } while (x < w);
+ }
+ }
+ } else if (vert_tap == 6) {
+ // vert_filt as 6 tap
+ const uint8_t *src_ptr = src - 2 * src_stride;
+
+ if (w <= 4) {
+ prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ y = h;
+
+ if (w == 2) {
+ __m128i s_16[6], ss_128[3];
+
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+ const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
+ const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
+ const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_6tap_2x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_16, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_32[6], ss_128[3];
+
+ assert(w == 4);
+
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_6tap_4x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_32, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 8) {
+ __m128i s_64[6];
+ __m256i ss_256[3];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+
+ y = h;
+ do {
+ src_ptr += 2 * src_stride;
+ const __m256i res = y_convolve_6tap_8x2_avx2(
+ src_ptr, src_stride, coeffs_256, s_64, ss_256);
+ sr_y_round_store_8x2_avx2(res, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m128i s_128[6];
+ __m256i ss_256[6], r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
+ s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+ s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
+ s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+
+ ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
+ ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
+
+ y = h;
+ do {
+ src_ptr += 2 * src_stride;
+ y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ ss_256, r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+
+ ss_256[3] = ss_256[4];
+ ss_256[4] = ss_256[5];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[6], ss_256[6], tt_256[6], r[4];
+
+ assert(!(w % 32));
+
+ x = 0;
+ do {
+ const uint8_t *s = src_ptr + x;
+ uint8_t *d = dst + x;
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+ ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
+ tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+ tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
+
+ y = h;
+ do {
+ s += 2 * src_stride;
+ y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ sr_y_round_store_32x2_avx2(r, d, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[3] = ss_256[4];
+ ss_256[4] = ss_256[5];
+
+ tt_256[0] = tt_256[1];
+ tt_256[1] = tt_256[2];
+ tt_256[3] = tt_256[4];
+ tt_256[4] = tt_256[5];
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+ } else if (vert_tap == 8) {
+ // vert_filt as 8 tap
+ const uint8_t *src_ptr = src - 3 * src_stride;
+
+ if (w <= 4) {
+ prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ y = h;
+
+ if (w == 2) {
+ __m128i s_16[8], ss_128[4];
+
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
+ s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
+ s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+ const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
+ const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
+ const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
+ const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
+ const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+ ss_128[2] = _mm_unpacklo_epi8(src45, src56);
+
+ do {
+ const __m128i res = y_convolve_8tap_2x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_16, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ ss_128[2] = ss_128[3];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_32[8], ss_128[4];
+
+ assert(w == 4);
+
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
+ s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
+ s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+ const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+ const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+ ss_128[2] = _mm_unpacklo_epi8(src45, src56);
+
+ do {
+ const __m128i res = y_convolve_8tap_4x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_32, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ ss_128[2] = ss_128[3];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 8) {
+ __m128i s_64[8];
+ __m256i ss_256[4];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
+ s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
+ s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
+ const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
+ const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+ ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
+
+ y = h;
+ do {
+ const __m256i res = y_convolve_8tap_8x2_avx2(
+ src_ptr, src_stride, coeffs_256, s_64, ss_256);
+ sr_y_round_store_8x2_avx2(res, dst, dst_stride);
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m128i s_128[8];
+ __m256i ss_256[8], r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
+ s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+ s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
+ s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
+ s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
+ s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
+ const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
+ const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+ ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
+
+ ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
+ ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
+ ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
+
+ y = h;
+ do {
+ y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ ss_256, r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+
+ ss_256[4] = ss_256[5];
+ ss_256[5] = ss_256[6];
+ ss_256[6] = ss_256[7];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[8], ss_256[8], tt_256[8], r[4];
+
+ assert(!(w % 32));
+
+ x = 0;
+ do {
+ const uint8_t *s = src_ptr + x;
+ uint8_t *d = dst + x;
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
+ s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
+ s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
+ ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
+ ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+ ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
+ ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
+ tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
+ tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+ tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
+ tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
+
+ y = h;
+ do {
+ y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ sr_y_round_store_32x2_avx2(r, d, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+ ss_256[4] = ss_256[5];
+ ss_256[5] = ss_256[6];
+ ss_256[6] = ss_256[7];
+
+ tt_256[0] = tt_256[1];
+ tt_256[1] = tt_256[2];
+ tt_256[2] = tt_256[3];
+ tt_256[4] = tt_256[5];
+ tt_256[5] = tt_256[6];
+ tt_256[6] = tt_256[7];
+ s += 2 * src_stride;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+ }
+}
+
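+// Reference scalar model of the vertical paths above (a minimal sketch with a
+// hypothetical name, not the optimized code; assumes clip_pixel() and
+// ROUND_POWER_OF_TWO() from aom_dsp/aom_dsp_common.h): each output pixel is a
+// FIR over vert_tap source rows, rounded back to 8 bits. The AVX2 paths
+// compute the same sums two output rows per iteration, rotating the
+// ss_256[]/tt_256[] registers so each loaded row is reused by every tap.
+//
+//   static void convolve_y_sr_scalar(const uint8_t *src, int src_stride,
+//                                    uint8_t *dst, int dst_stride, int w,
+//                                    int h, const int16_t *filter, int taps) {
+//     src -= (taps / 2 - 1) * src_stride;  // top row of the filter window
+//     for (int y = 0; y < h; ++y) {
+//       for (int x = 0; x < w; ++x) {
+//         int sum = 0;
+//         for (int k = 0; k < taps; ++k)
+//           sum += filter[k] * src[(y + k) * src_stride + x];
+//         // FILTER_BITS is 7 in libaom.
+//         dst[y * dst_stride + x] =
+//             clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+//       }
+//     }
+//   }
+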
+static INLINE void sr_x_2tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[1],
+ uint8_t *const dst) {
+ __m256i r[2];
+
+ x_convolve_2tap_32_avx2(src, coeffs, r);
+ sr_x_round_store_32_avx2(r, dst);
+}
+
+static INLINE void sr_x_6tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[3],
+ const __m256i filt[3],
+ uint8_t *const dst) {
+ __m256i r[2];
+
+ x_convolve_6tap_32_avx2(src, coeffs, filt, r);
+ sr_x_round_store_32_avx2(r, dst);
+}
+
+static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[4],
+ const __m256i filt[4],
+ uint8_t *const dst) {
+ __m256i r[2];
+
+ x_convolve_8tap_32_avx2(src, coeffs, filt, r);
+ sr_x_round_store_32_avx2(r, dst);
+}
+
+static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
+ const int32_t subpel_x_q4, ConvolveParams *conv_params) {
+ int32_t y = h;
+ __m128i coeffs_128[4];
+ __m256i coeffs_256[4];
+
+ assert(conv_params->round_0 == 3);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+ (void)conv_params;
+
+ const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
+
+ if (horz_tap == 2) {
+ // horz_filt as 2 tap
+ const uint8_t *src_ptr = src;
+
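+ // A sub-pel offset of 8 is the half-pel position; for a 2-tap filter this
+ // reduces to averaging two horizontally adjacent pixels, handled in the else
+ // branch below.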
+ if (subpel_x_q4 != 8) {
+ if (w <= 8) {
+ prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
+ coeffs_128);
+
+ if (w == 2) {
+ do {
+ const __m128i res =
+ x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i res =
+ x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ assert(w == 8);
+
+ do {
+ __m128i res[2];
+
+ x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
+ res[0] = sr_x_round_sse2(res[0]);
+ res[1] = sr_x_round_sse2(res[1]);
+ const __m128i d = _mm_packus_epi16(res[0], res[1]);
+ _mm_storel_epi64((__m128i *)dst, d);
+ _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
+ sr_x_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ }
+ } else {
+ // average to get half pel
+ if (w == 2) {
+ do {
+ __m128i s_128;
+
+ s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
+ const __m128i s1 = _mm_srli_si128(s_128, 1);
+ const __m128i d = _mm_avg_epu8(s_128, s1);
+ *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
+ *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ __m128i s_128;
+
+ s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
+ const __m128i s1 = _mm_srli_si128(s_128, 1);
+ const __m128i d = _mm_avg_epu8(s_128, s1);
+ xx_storel_32(dst, d);
+ *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ do {
+ const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
+ const __m128i s10 =
+ _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
+ const __m128i s01 = _mm_srli_si128(s00, 1);
+ const __m128i s11 = _mm_srli_si128(s10, 1);
+ const __m128i d0 = _mm_avg_epu8(s00, s01);
+ const __m128i d1 = _mm_avg_epu8(s10, s11);
+ _mm_storel_epi64((__m128i *)dst, d0);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
+ const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
+ const __m128i s10 =
+ _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
+ const __m128i s11 =
+ _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
+ const __m128i d0 = _mm_avg_epu8(s00, s01);
+ const __m128i d1 = _mm_avg_epu8(s10, s11);
+ _mm_storeu_si128((__m128i *)dst, d0);
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_2tap_32_avg_avx2(src_ptr, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ }
+ } else if (horz_tap == 4) {
+ // horz_filt as 4 tap
+ const uint8_t *src_ptr = src - 1;
+
+ prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
+
+ if (w == 2) {
+ do {
+ const __m128i res =
+ x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i res =
+ x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
+ // rewrite this for better performance later.
+ __m256i filt_256[2];
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
+
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+ for (int i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
+ res_16b = sr_x_round_avx2(res_16b);
+
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ }
+ } else {
+ assert(!(w % 16));
+ // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
+ // rewrite this for better performance later.
+ __m256i filt_256[2];
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b =
+ convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
+ res_16b = sr_x_round_avx2(res_16b);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
+ }
+ } else {
+ __m256i filt_256[4];
+
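+ // filt1..filt4_global_avx2 are byte-shuffle masks that gather, for each
+ // output pixel, the pairs of adjacent input bytes multiplied by each
+ // coefficient pair, so the wide horizontal filters can use byte-wise
+ // multiply-adds.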
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+ filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
+
+ if (horz_tap == 6) {
+ // horz_filt as 6 tap
+ const uint8_t *src_ptr = src - 2;
+
+ prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 8) {
+ do {
+ const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
+ coeffs_256, filt_256);
+ sr_x_round_store_8x2_avx2(res, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
+ r);
+ sr_x_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
+ dst + 1 * 32);
+ sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
+ dst + 2 * 32);
+ sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
+ dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ } else if (horz_tap == 8) {
+ // horz_filt as 8 tap
+ const uint8_t *src_ptr = src - 3;
+
+ filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
+
+ prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 8) {
+ do {
+ const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
+ coeffs_256, filt_256);
+ sr_x_round_store_8x2_avx2(res, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
+ r);
+ sr_x_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
+ dst + 1 * 32);
+ sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
+ dst + 2 * 32);
+ sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
+ dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ }
+ }
+}
+
+#endif // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
diff --git a/third_party/SVT-AV1/synonyms.h b/third_party/SVT-AV1/synonyms.h
new file mode 100644
index 000000000..0ded6e5cf
--- /dev/null
+++ b/third_party/SVT-AV1/synonyms.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_THIRD_PARTY_SVT_AV1_SYNONYMS_H_
+#define AOM_THIRD_PARTY_SVT_AV1_SYNONYMS_H_
+
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE __m128i load_u8_8x2_sse2(const uint8_t *const src,
+ const ptrdiff_t stride) {
+ return load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
+}
+
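+// Stores the low 4 bytes of src to dst and bytes 4..7 of src to dst + stride.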
+static AOM_FORCE_INLINE void store_u8_4x2_sse2(const __m128i src,
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ xx_storel_32(dst, src);
+ *(uint32_t *)(dst + stride) =
+ ((uint32_t)_mm_extract_epi16(src, 3) << 16) | _mm_extract_epi16(src, 2);
+}
+
+#endif // AOM_THIRD_PARTY_SVT_AV1_SYNONYMS_H_
diff --git a/third_party/googletest/README.libaom b/third_party/googletest/README.libaom
index a461f3672..5e429d4da 100644
--- a/third_party/googletest/README.libaom
+++ b/third_party/googletest/README.libaom
@@ -1,5 +1,5 @@
URL: https://github.com/google/googletest
-Version: release-1.11.0
+Version: release-1.12.1
License: BSD
License File: LICENSE
@@ -13,6 +13,7 @@ generation.
Local Modifications:
- Remove everything but:
+ .clang-format
CMakeLists.txt
CONTRIBUTORS
googlemock/
@@ -29,3 +30,9 @@ Local Modifications:
src
LICENSE
README.md
+- In googletest/include/gtest/internal/custom/gtest-port.h, define
+ GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix
+ the mingw32 g++ compilation errors caused by the lack of std::mutex
+ and std::condition_variable in the <mutex> and <condition_variable>
+ headers if mingw32 is configured with the win32 threads option. See
+ https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32
diff --git a/third_party/googletest/src/.clang-format b/third_party/googletest/src/.clang-format
new file mode 100644
index 000000000..5b9bfe6d2
--- /dev/null
+++ b/third_party/googletest/src/.clang-format
@@ -0,0 +1,4 @@
+# Run manually to reformat a file:
+# clang-format -i --style=file <file>
+Language: Cpp
+BasedOnStyle: Google
diff --git a/third_party/googletest/src/CMakeLists.txt b/third_party/googletest/src/CMakeLists.txt
index ea81ab129..102e28cd4 100644
--- a/third_party/googletest/src/CMakeLists.txt
+++ b/third_party/googletest/src/CMakeLists.txt
@@ -1,19 +1,21 @@
# Note: CMake support is community-based. The maintainers do not use CMake
# internally.
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.5)
if (POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
endif (POLICY CMP0048)
+if (POLICY CMP0077)
+ cmake_policy(SET CMP0077 NEW)
+endif (POLICY CMP0077)
+
project(googletest-distribution)
-set(GOOGLETEST_VERSION 1.11.0)
+set(GOOGLETEST_VERSION 1.12.1)
-if (CMAKE_VERSION VERSION_GREATER "3.0.2")
- if(NOT CYGWIN AND NOT MSYS AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL QNX)
- set(CMAKE_CXX_EXTENSIONS OFF)
- endif()
+if(NOT CYGWIN AND NOT MSYS AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL QNX)
+ set(CMAKE_CXX_EXTENSIONS OFF)
endif()
enable_testing()
diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS
index 76db0b40f..77397a5b5 100644
--- a/third_party/googletest/src/CONTRIBUTORS
+++ b/third_party/googletest/src/CONTRIBUTORS
@@ -34,6 +34,7 @@ Manuel Klimek <klimek@google.com>
Mario Tanev <radix@google.com>
Mark Paskin
Markus Heule <markus.heule@gmail.com>
+Martijn Vels <mvels@google.com>
Matthew Simmons <simmonmt@acm.org>
Mika Raento <mikie@iki.fi>
Mike Bland <mbland@google.com>
@@ -55,6 +56,7 @@ Russ Rufer <russ@pentad.com>
Sean Mcafee <eefacm@gmail.com>
Sigurður Ásgeirsson <siggi@google.com>
Sverre Sundsdal <sundsdal@gmail.com>
+Szymon Sobik <sobik.szymon@gmail.com>
Takeshi Yoshino <tyoshino@google.com>
Tracy Bialik <tracy@pentad.com>
Vadim Berman <vadimb@google.com>
diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md
index 7d872a57e..30edaecf3 100644
--- a/third_party/googletest/src/README.md
+++ b/third_party/googletest/src/README.md
@@ -6,7 +6,8 @@
GoogleTest now follows the
[Abseil Live at Head philosophy](https://abseil.io/about/philosophy#upgrade-support).
-We recommend using the latest commit in the `master` branch in your projects.
+We recommend
+[updating to the latest commit in the `main` branch as often as possible](https://github.com/abseil/abseil-cpp/blob/master/FAQ.md#what-is-live-at-head-and-how-do-i-do-it).
#### Documentation Updates
@@ -14,9 +15,9 @@ Our documentation is now live on GitHub Pages at
https://google.github.io/googletest/. We recommend browsing the documentation on
GitHub Pages rather than directly in the repository.
-#### Release 1.10.x
+#### Release 1.11.0
-[Release 1.10.x](https://github.com/google/googletest/releases/tag/release-1.10.0)
+[Release 1.11.0](https://github.com/google/googletest/releases/tag/release-1.11.0)
is now available.
#### Coming Soon
@@ -109,8 +110,8 @@ Windows and Linux platforms.
[GoogleTest UI](https://github.com/ospector/gtest-gbar) is a test runner that
runs your test binary, allows you to track its progress via a progress bar, and
-displays a list of test failures. Clicking on one shows failure text. Google
-Test UI is written in C#.
+displays a list of test failures. Clicking on one shows failure text. GoogleTest
+UI is written in C#.
[GTest TAP Listener](https://github.com/kinow/gtest-tap-listener) is an event
listener for GoogleTest that implements the
@@ -121,11 +122,11 @@ result output. If your test runner understands TAP, you may find it useful.
runs tests from your binary in parallel to provide significant speed-up.
[GoogleTest Adapter](https://marketplace.visualstudio.com/items?itemName=DavidSchuldenfrei.gtest-adapter)
-is a VS Code extension allowing to view GoogleTest in a tree view, and run/debug
+is a VS Code extension allowing to view GoogleTest in a tree view and run/debug
your tests.
[C++ TestMate](https://github.com/matepek/vscode-catch2-test-adapter) is a VS
-Code extension allowing to view GoogleTest in a tree view, and run/debug your
+Code extension allowing to view GoogleTest in a tree view and run/debug your
tests.
[Cornichon](https://pypi.org/project/cornichon/) is a small Gherkin DSL parser
diff --git a/third_party/googletest/src/googlemock/CMakeLists.txt b/third_party/googletest/src/googlemock/CMakeLists.txt
index e7df8ec53..5c1f0dafe 100644
--- a/third_party/googletest/src/googlemock/CMakeLists.txt
+++ b/third_party/googletest/src/googlemock/CMakeLists.txt
@@ -36,13 +36,9 @@ endif()
# as ${gmock_SOURCE_DIR} and to the root binary directory as
# ${gmock_BINARY_DIR}.
# Language "C" is required for find_package(Threads).
-if (CMAKE_VERSION VERSION_LESS 3.0)
- project(gmock CXX C)
-else()
- cmake_policy(SET CMP0048 NEW)
- project(gmock VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
-endif()
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.5)
+cmake_policy(SET CMP0048 NEW)
+project(gmock VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
if (COMMAND set_up_hermetic_build)
set_up_hermetic_build()
@@ -109,11 +105,12 @@ endif()
# to the targets for when we are part of a parent build (ie being pulled
# in via add_subdirectory() rather than being a standalone build).
if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+ string(REPLACE ";" "$<SEMICOLON>" dirs "${gmock_build_include_dirs}")
target_include_directories(gmock SYSTEM INTERFACE
- "$<BUILD_INTERFACE:${gmock_build_include_dirs}>"
+ "$<BUILD_INTERFACE:${dirs}>"
"$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
target_include_directories(gmock_main SYSTEM INTERFACE
- "$<BUILD_INTERFACE:${gmock_build_include_dirs}>"
+ "$<BUILD_INTERFACE:${dirs}>"
"$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
endif()
@@ -154,7 +151,10 @@ if (gmock_build_tests)
cxx_test(gmock_ex_test gmock_main)
cxx_test(gmock-function-mocker_test gmock_main)
cxx_test(gmock-internal-utils_test gmock_main)
- cxx_test(gmock-matchers_test gmock_main)
+ cxx_test(gmock-matchers-arithmetic_test gmock_main)
+ cxx_test(gmock-matchers-comparisons_test gmock_main)
+ cxx_test(gmock-matchers-containers_test gmock_main)
+ cxx_test(gmock-matchers-misc_test gmock_main)
cxx_test(gmock-more-actions_test gmock_main)
cxx_test(gmock-nice-strict_test gmock_main)
cxx_test(gmock-port_test gmock_main)
diff --git a/third_party/googletest/src/googlemock/README.md b/third_party/googletest/src/googlemock/README.md
index ead688325..7da60655d 100644
--- a/third_party/googletest/src/googlemock/README.md
+++ b/third_party/googletest/src/googlemock/README.md
@@ -35,10 +35,6 @@ Details and examples can be found here:
* [gMock Cookbook](https://google.github.io/googletest/gmock_cook_book.html)
* [gMock Cheat Sheet](https://google.github.io/googletest/gmock_cheat_sheet.html)
-Please note that code under scripts/generator/ is from the
-[cppclean project](http://code.google.com/p/cppclean/) and under the Apache
-License, which is different from GoogleMock's license.
-
GoogleMock is a part of
[GoogleTest C++ testing framework](http://github.com/google/googletest/) and is
subject to the same requirements.
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h b/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h
index f2393bd3a..c785ad8ab 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// The ACTION* family of macros can be used in a namespace scope to
@@ -125,13 +124,14 @@
// To learn more about using these macros, please search for 'ACTION' on
// https://github.com/google/googletest/blob/master/docs/gmock_cook_book.md
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
#ifndef _WIN32_WCE
-# include <errno.h>
+#include <errno.h>
#endif
#include <algorithm>
@@ -147,8 +147,8 @@
#include "gmock/internal/gmock-pp.h"
#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4100)
+#pragma warning(push)
+#pragma warning(disable : 4100)
#endif
namespace testing {
@@ -196,9 +196,7 @@ class BuiltInDefaultValue {
public:
// This function returns true if and only if type T has a built-in default
// value.
- static bool Exists() {
- return ::std::is_default_constructible<T>::value;
- }
+ static bool Exists() { return ::std::is_default_constructible<T>::value; }
static T Get() {
return BuiltInDefaultValueGetter<
@@ -227,11 +225,11 @@ class BuiltInDefaultValue<T*> {
// The following specializations define the default values for
// specific types we care about.
#define GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(type, value) \
- template <> \
- class BuiltInDefaultValue<type> { \
- public: \
- static bool Exists() { return true; } \
- static type Get() { return value; } \
+ template <> \
+ class BuiltInDefaultValue<type> { \
+ public: \
+ static bool Exists() { return true; } \
+ static type Get() { return value; } \
}
GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(void, ); // NOLINT
@@ -255,21 +253,309 @@ GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned short, 0U); // NOLINT
GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed short, 0); // NOLINT
GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned int, 0U);
GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed int, 0);
-GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL); // NOLINT
-GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L); // NOLINT
GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long long, 0); // NOLINT
-GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long long, 0); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long long, 0); // NOLINT
GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(float, 0);
GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(double, 0);
#undef GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_
-// Simple two-arg form of std::disjunction.
-template <typename P, typename Q>
-using disjunction = typename ::std::conditional<P::value, P, Q>::type;
+// Partial implementations of metaprogramming types from the standard library
+// not available in C++11.
+
+template <typename P>
+struct negation
+ // NOLINTNEXTLINE
+ : std::integral_constant<bool, bool(!P::value)> {};
+
+// Base case: with zero predicates the answer is always true.
+template <typename...>
+struct conjunction : std::true_type {};
+
+// With a single predicate, the answer is that predicate.
+template <typename P1>
+struct conjunction<P1> : P1 {};
+
+// With multiple predicates the answer is the first predicate if that is false,
+// and we recurse otherwise.
+template <typename P1, typename... Ps>
+struct conjunction<P1, Ps...>
+ : std::conditional<bool(P1::value), conjunction<Ps...>, P1>::type {};
+
+template <typename...>
+struct disjunction : std::false_type {};
+
+template <typename P1>
+struct disjunction<P1> : P1 {};
+
+template <typename P1, typename... Ps>
+struct disjunction<P1, Ps...>
+ // NOLINTNEXTLINE
+ : std::conditional<!bool(P1::value), disjunction<Ps...>, P1>::type {};
+
+template <typename...>
+using void_t = void;
+
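For illustration only (not part of this change), a minimal sketch of how these C++11 backports behave: they mirror C++17's std::negation / std::conjunction / std::disjunction, including the short-circuit on the first decisive predicate. The assertions are invented for the example and use the ::testing::internal names introduced above, which are not a public API.

#include <type_traits>

#include "gmock/gmock.h"

namespace ti = ::testing::internal;

// With zero predicates, conjunction is vacuously true and disjunction false.
static_assert(ti::conjunction<>::value, "empty conjunction is true");
static_assert(!ti::disjunction<>::value, "empty disjunction is false");

// The first decisive predicate determines the result.
static_assert(!ti::conjunction<std::true_type, std::false_type>::value,
              "a false predicate makes the conjunction false");
static_assert(ti::disjunction<std::false_type, std::true_type>::value,
              "a true predicate makes the disjunction true");
static_assert(ti::negation<std::false_type>::value, "negation flips the value");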
+// Detects whether an expression of type `From` can be implicitly converted to
+// `To` according to [conv]. In C++17, [conv]/3 defines this as follows:
+//
+// An expression e can be implicitly converted to a type T if and only if
+// the declaration T t=e; is well-formed, for some invented temporary
+// variable t ([dcl.init]).
+//
+// [conv]/2 implies we can use function argument passing to detect whether this
+// initialization is valid.
+//
+// Note that this is distinct from is_convertible, which requires this be valid:
+//
+// To test() {
+// return declval<From>();
+// }
+//
+// In particular, is_convertible doesn't give the correct answer when `To` and
+// `From` are the same non-moveable type since `declval<From>` will be an rvalue
+// reference, defeating the guaranteed copy elision that would otherwise make
+// this function work.
+//
+// REQUIRES: `From` is not cv void.
+template <typename From, typename To>
+struct is_implicitly_convertible {
+ private:
+ // A function that accepts a parameter of type T. This can be called with type
+ // U successfully only if U is implicitly convertible to T.
+ template <typename T>
+ static void Accept(T);
+
+ // A function that creates a value of type T.
+ template <typename T>
+ static T Make();
+
+  // An overload to be selected when implicit conversion from T to To is possible.
+ template <typename T, typename = decltype(Accept<To>(Make<T>()))>
+ static std::true_type TestImplicitConversion(int);
+
+ // A fallback overload selected in all other cases.
+ template <typename T>
+ static std::false_type TestImplicitConversion(...);
+
+ public:
+ using type = decltype(TestImplicitConversion<From>(0));
+ static constexpr bool value = type::value;
+};
+
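For illustration only, a sketch of what this trait reports; the Explicit type and the assertions are invented for the example. Only conversions that [conv] allows implicitly are accepted, so explicit constructors do not count.

#include "gmock/gmock.h"

namespace ti = ::testing::internal;

// Example-only type that can be constructed from int only explicitly.
struct Explicit {
  explicit Explicit(int) {}
};

static_assert(ti::is_implicitly_convertible<int, long>::value,
              "int converts implicitly to long");
static_assert(!ti::is_implicitly_convertible<void*, int>::value,
              "no implicit conversion from void* to int");
static_assert(!ti::is_implicitly_convertible<int, Explicit>::value,
              "explicit constructors are not considered");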
+// Like std::invoke_result_t from C++17, but works only for objects with call
+// operators (not e.g. member function pointers, which we don't need specific
+// support for in OnceAction because std::function deals with them).
+template <typename F, typename... Args>
+using call_result_t = decltype(std::declval<F>()(std::declval<Args>()...));
+
+template <typename Void, typename R, typename F, typename... Args>
+struct is_callable_r_impl : std::false_type {};
+
+// Specialize the struct for those template arguments where call_result_t is
+// well-formed. When it's not, the generic template above is chosen, resulting
+// in std::false_type.
+template <typename R, typename F, typename... Args>
+struct is_callable_r_impl<void_t<call_result_t<F, Args...>>, R, F, Args...>
+ : std::conditional<
+ std::is_void<R>::value, //
+ std::true_type, //
+ is_implicitly_convertible<call_result_t<F, Args...>, R>>::type {};
+
+// Like std::is_invocable_r from C++17, but works only for objects with call
+// operators. See the note on call_result_t.
+template <typename R, typename F, typename... Args>
+using is_callable_r = is_callable_r_impl<void, R, F, Args...>;
+
+// Like std::as_const from C++17.
+template <typename T>
+typename std::add_const<T>::type& as_const(T& t) {
+ return t;
+}
} // namespace internal
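A small sketch (illustrative only) of is_callable_r in action; MakeGreeting and the assertions are invented for the example and assume "gmock/gmock.h" is included. Like std::is_invocable_r, a void result type accepts any callable result, and a signature mismatch yields false rather than a hard error.

#include <string>

#include "gmock/gmock.h"

namespace ti = ::testing::internal;

// Example-only functor with a single call operator.
struct MakeGreeting {
  std::string operator()(const std::string& name) const { return "hi " + name; }
};

static_assert(ti::is_callable_r<std::string, MakeGreeting, const char*>::value,
              "const char* binds to const std::string&, result is std::string");
static_assert(ti::is_callable_r<void, MakeGreeting, std::string>::value,
              "a void result type accepts any callable result");
static_assert(!ti::is_callable_r<std::string, MakeGreeting, int, int>::value,
              "wrong arity is rejected, not a hard error");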
+// Specialized for function types below.
+template <typename F>
+class OnceAction;
+
+// An action that can only be used once.
+//
+// This is accepted by WillOnce, which doesn't require the underlying action to
+// be copy-constructible (only move-constructible), and promises to invoke it as
+// an rvalue reference. This allows the action to work with move-only types like
+// std::move_only_function in a type-safe manner.
+//
+// For example:
+//
+// // Assume we have some API that needs to accept a unique pointer to some
+// // non-copyable object Foo.
+// void AcceptUniquePointer(std::unique_ptr<Foo> foo);
+//
+//   // We can define an action that provides a Foo to that API. Because it
+// // has to give away its unique pointer, it must not be called more than
+// // once, so its call operator is &&-qualified.
+// struct ProvideFoo {
+// std::unique_ptr<Foo> foo;
+//
+// void operator()() && {
+//       AcceptUniquePointer(std::move(foo));
+// }
+// };
+//
+// // This action can be used with WillOnce.
+// EXPECT_CALL(mock, Call)
+// .WillOnce(ProvideFoo{std::make_unique<Foo>(...)});
+//
+// // But a call to WillRepeatedly will fail to compile. This is correct,
+// // since the action cannot correctly be used repeatedly.
+// EXPECT_CALL(mock, Call)
+// .WillRepeatedly(ProvideFoo{std::make_unique<Foo>(...)});
+//
+// A less-contrived example would be an action that returns an arbitrary type,
+// whose &&-qualified call operator is capable of dealing with move-only types.
+template <typename Result, typename... Args>
+class OnceAction<Result(Args...)> final {
+ private:
+ // True iff we can use the given callable type (or lvalue reference) directly
+ // via StdFunctionAdaptor.
+ template <typename Callable>
+ using IsDirectlyCompatible = internal::conjunction<
+ // It must be possible to capture the callable in StdFunctionAdaptor.
+ std::is_constructible<typename std::decay<Callable>::type, Callable>,
+ // The callable must be compatible with our signature.
+ internal::is_callable_r<Result, typename std::decay<Callable>::type,
+ Args...>>;
+
+ // True iff we can use the given callable type via StdFunctionAdaptor once we
+ // ignore incoming arguments.
+ template <typename Callable>
+ using IsCompatibleAfterIgnoringArguments = internal::conjunction<
+ // It must be possible to capture the callable in a lambda.
+ std::is_constructible<typename std::decay<Callable>::type, Callable>,
+ // The callable must be invocable with zero arguments, returning something
+ // convertible to Result.
+ internal::is_callable_r<Result, typename std::decay<Callable>::type>>;
+
+ public:
+ // Construct from a callable that is directly compatible with our mocked
+ // signature: it accepts our function type's arguments and returns something
+ // convertible to our result type.
+ template <typename Callable,
+ typename std::enable_if<
+ internal::conjunction<
+ // Teach clang on macOS that we're not talking about a
+ // copy/move constructor here. Otherwise it gets confused
+ // when checking the is_constructible requirement of our
+ // traits above.
+ internal::negation<std::is_same<
+ OnceAction, typename std::decay<Callable>::type>>,
+ IsDirectlyCompatible<Callable>> //
+ ::value,
+ int>::type = 0>
+ OnceAction(Callable&& callable) // NOLINT
+ : function_(StdFunctionAdaptor<typename std::decay<Callable>::type>(
+ {}, std::forward<Callable>(callable))) {}
+
+ // As above, but for a callable that ignores the mocked function's arguments.
+ template <typename Callable,
+ typename std::enable_if<
+ internal::conjunction<
+ // Teach clang on macOS that we're not talking about a
+ // copy/move constructor here. Otherwise it gets confused
+ // when checking the is_constructible requirement of our
+ // traits above.
+ internal::negation<std::is_same<
+ OnceAction, typename std::decay<Callable>::type>>,
+ // Exclude callables for which the overload above works.
+ // We'd rather provide the arguments if possible.
+ internal::negation<IsDirectlyCompatible<Callable>>,
+ IsCompatibleAfterIgnoringArguments<Callable>>::value,
+ int>::type = 0>
+ OnceAction(Callable&& callable) // NOLINT
+ // Call the constructor above with a callable
+ // that ignores the input arguments.
+ : OnceAction(IgnoreIncomingArguments<typename std::decay<Callable>::type>{
+ std::forward<Callable>(callable)}) {}
+
+ // We are naturally copyable because we store only an std::function, but
+ // semantically we should not be copyable.
+ OnceAction(const OnceAction&) = delete;
+ OnceAction& operator=(const OnceAction&) = delete;
+ OnceAction(OnceAction&&) = default;
+
+ // Invoke the underlying action callable with which we were constructed,
+ // handing it the supplied arguments.
+ Result Call(Args... args) && {
+ return function_(std::forward<Args>(args)...);
+ }
+
+ private:
+ // An adaptor that wraps a callable that is compatible with our signature and
+ // being invoked as an rvalue reference so that it can be used as an
+ // StdFunctionAdaptor. This throws away type safety, but that's fine because
+ // this is only used by WillOnce, which we know calls at most once.
+ //
+ // Once we have something like std::move_only_function from C++23, we can do
+ // away with this.
+ template <typename Callable>
+ class StdFunctionAdaptor final {
+ public:
+ // A tag indicating that the (otherwise universal) constructor is accepting
+ // the callable itself, instead of e.g. stealing calls for the move
+ // constructor.
+ struct CallableTag final {};
+
+ template <typename F>
+ explicit StdFunctionAdaptor(CallableTag, F&& callable)
+ : callable_(std::make_shared<Callable>(std::forward<F>(callable))) {}
+
+ // Rather than explicitly returning Result, we return whatever the wrapped
+ // callable returns. This allows for compatibility with existing uses like
+ // the following, when the mocked function returns void:
+ //
+ // EXPECT_CALL(mock_fn_, Call)
+ // .WillOnce([&] {
+ // [...]
+ // return 0;
+ // });
+ //
+ // Such a callable can be turned into std::function<void()>. If we use an
+ // explicit return type of Result here then it *doesn't* work with
+ // std::function, because we'll get a "void function should not return a
+ // value" error.
+ //
+ // We need not worry about incompatible result types because the SFINAE on
+ // OnceAction already checks this for us. std::is_invocable_r_v itself makes
+ // the same allowance for void result types.
+ template <typename... ArgRefs>
+ internal::call_result_t<Callable, ArgRefs...> operator()(
+ ArgRefs&&... args) const {
+ return std::move(*callable_)(std::forward<ArgRefs>(args)...);
+ }
+
+ private:
+ // We must put the callable on the heap so that we are copyable, which
+ // std::function needs.
+ std::shared_ptr<Callable> callable_;
+ };
+
+ // An adaptor that makes a callable that accepts zero arguments callable with
+ // our mocked arguments.
+ template <typename Callable>
+ struct IgnoreIncomingArguments {
+ internal::call_result_t<Callable> operator()(Args&&...) {
+ return std::move(callable)();
+ }
+
+ Callable callable;
+ };
+
+ std::function<Result(Args...)> function_;
+};
+
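A sketch of what the class above enables in practice (illustrative only; the test name and values are invented, and a C++14 init-capture is used for brevity): WillOnce can now accept a move-only callable, which the copy-based Action<F> path could not.

#include <memory>
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(OnceActionSketch, AcceptsMoveOnlyCallable) {
  testing::MockFunction<std::string()> mock;

  // The lambda owns a unique_ptr, so it is move-only. WillOnce accepts it
  // because OnceAction requires only move construction and promises to invoke
  // the callable at most once, as an rvalue.
  auto owned = std::make_unique<std::string>("taco");
  EXPECT_CALL(mock, Call()).WillOnce(
      [p = std::move(owned)] { return *p; });

  EXPECT_EQ("taco", mock.AsStdFunction()());
}

Before this change, the same expectation would have required the callable to be copy-constructible.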
// When an unexpected function call is encountered, Google Mock will
// let it return a default value if the user has specified one for its
// return type, or if the return type has a built-in default value;
@@ -339,7 +625,8 @@ class DefaultValue {
private:
const T value_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(FixedValueProducer);
+ FixedValueProducer(const FixedValueProducer&) = delete;
+ FixedValueProducer& operator=(const FixedValueProducer&) = delete;
};
class FactoryValueProducer : public ValueProducer {
@@ -350,7 +637,8 @@ class DefaultValue {
private:
const FactoryFunction factory_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(FactoryValueProducer);
+ FactoryValueProducer(const FactoryValueProducer&) = delete;
+ FactoryValueProducer& operator=(const FactoryValueProducer&) = delete;
};
static ValueProducer* producer_;
@@ -424,28 +712,34 @@ class ActionInterface {
virtual Result Perform(const ArgumentTuple& args) = 0;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionInterface);
+ ActionInterface(const ActionInterface&) = delete;
+ ActionInterface& operator=(const ActionInterface&) = delete;
};
-// An Action<F> is a copyable and IMMUTABLE (except by assignment)
-// object that represents an action to be taken when a mock function
-// of type F is called. The implementation of Action<T> is just a
-// std::shared_ptr to const ActionInterface<T>. Don't inherit from Action!
-// You can view an object implementing ActionInterface<F> as a
-// concrete action (including its current state), and an Action<F>
-// object as a handle to it.
template <typename F>
-class Action {
+class Action;
+
+// An Action<R(Args...)> is a copyable and IMMUTABLE (except by assignment)
+// object that represents an action to be taken when a mock function of type
+// R(Args...) is called. The implementation of Action<T> is just a
+// std::shared_ptr to const ActionInterface<T>. Don't inherit from Action! You
+// can view an object implementing ActionInterface<F> as a concrete action
+// (including its current state), and an Action<F> object as a handle to it.
+template <typename R, typename... Args>
+class Action<R(Args...)> {
+ private:
+ using F = R(Args...);
+
// Adapter class to allow constructing Action from a legacy ActionInterface.
// New code should create Actions from functors instead.
struct ActionAdapter {
// Adapter must be copyable to satisfy std::function requirements.
::std::shared_ptr<ActionInterface<F>> impl_;
- template <typename... Args>
- typename internal::Function<F>::Result operator()(Args&&... args) {
+ template <typename... InArgs>
+ typename internal::Function<F>::Result operator()(InArgs&&... args) {
return impl_->Perform(
- ::std::forward_as_tuple(::std::forward<Args>(args)...));
+ ::std::forward_as_tuple(::std::forward<InArgs>(args)...));
}
};
@@ -480,7 +774,8 @@ class Action {
// Action<F>, as long as F's arguments can be implicitly converted
// to Func's and Func's return type can be implicitly converted to F's.
template <typename Func>
- explicit Action(const Action<Func>& action) : fun_(action.fun_) {}
+ Action(const Action<Func>& action) // NOLINT
+ : fun_(action.fun_) {}
// Returns true if and only if this is the DoDefault() action.
bool IsDoDefault() const { return fun_ == nullptr; }
@@ -498,6 +793,24 @@ class Action {
return internal::Apply(fun_, ::std::move(args));
}
+ // An action can be used as a OnceAction, since it's obviously safe to call it
+ // once.
+ operator OnceAction<F>() const { // NOLINT
+ // Return a OnceAction-compatible callable that calls Perform with the
+ // arguments it is provided. We could instead just return fun_, but then
+ // we'd need to handle the IsDoDefault() case separately.
+ struct OA {
+ Action<F> action;
+
+ R operator()(Args... args) && {
+ return action.Perform(
+ std::forward_as_tuple(std::forward<Args>(args)...));
+ }
+ };
+
+ return OA{*this};
+ }
+
private:
template <typename G>
friend class Action;
@@ -514,8 +827,8 @@ class Action {
template <typename FunctionImpl>
struct IgnoreArgs {
- template <typename... Args>
- Result operator()(const Args&...) const {
+ template <typename... InArgs>
+ Result operator()(const InArgs&...) const {
return function_impl();
}
@@ -606,118 +919,198 @@ struct ByMoveWrapper {
T payload;
};
-// Implements the polymorphic Return(x) action, which can be used in
-// any function that returns the type of x, regardless of the argument
-// types.
-//
-// Note: The value passed into Return must be converted into
-// Function<F>::Result when this action is cast to Action<F> rather than
-// when that action is performed. This is important in scenarios like
-//
-// MOCK_METHOD1(Method, T(U));
-// ...
-// {
-// Foo foo;
-// X x(&foo);
-// EXPECT_CALL(mock, Method(_)).WillOnce(Return(x));
-// }
-//
-// In the example above the variable x holds reference to foo which leaves
-// scope and gets destroyed. If copying X just copies a reference to foo,
-// that copy will be left with a hanging reference. If conversion to T
-// makes a copy of foo, the above code is safe. To support that scenario, we
-// need to make sure that the type conversion happens inside the EXPECT_CALL
-// statement, and conversion of the result of Return to Action<T(U)> is a
-// good place for that.
-//
-// The real life example of the above scenario happens when an invocation
-// of gtl::Container() is passed into Return.
-//
+// The general implementation of Return(R). Specializations follow below.
template <typename R>
-class ReturnAction {
+class ReturnAction final {
public:
- // Constructs a ReturnAction object from the value to be returned.
- // 'value' is passed by value instead of by const reference in order
- // to allow Return("string literal") to compile.
- explicit ReturnAction(R value) : value_(new R(std::move(value))) {}
+ explicit ReturnAction(R value) : value_(std::move(value)) {}
+
+ template <typename U, typename... Args,
+ typename = typename std::enable_if<conjunction<
+ // See the requirements documented on Return.
+ negation<std::is_same<void, U>>, //
+ negation<std::is_reference<U>>, //
+ std::is_convertible<R, U>, //
+ std::is_move_constructible<U>>::value>::type>
+ operator OnceAction<U(Args...)>() && { // NOLINT
+ return Impl<U>(std::move(value_));
+ }
- // This template type conversion operator allows Return(x) to be
- // used in ANY function that returns x's type.
- template <typename F>
- operator Action<F>() const { // NOLINT
- // Assert statement belongs here because this is the best place to verify
- // conditions on F. It produces the clearest error messages
- // in most compilers.
- // Impl really belongs in this scope as a local class but can't
- // because MSVC produces duplicate symbols in different translation units
- // in this case. Until MS fixes that bug we put Impl into the class scope
- // and put the typedef both here (for use in assert statement) and
- // in the Impl class. But both definitions must be the same.
- typedef typename Function<F>::Result Result;
- GTEST_COMPILE_ASSERT_(
- !std::is_reference<Result>::value,
- use_ReturnRef_instead_of_Return_to_return_a_reference);
- static_assert(!std::is_void<Result>::value,
- "Can't use Return() on an action expected to return `void`.");
- return Action<F>(new Impl<R, F>(value_));
+ template <typename U, typename... Args,
+ typename = typename std::enable_if<conjunction<
+ // See the requirements documented on Return.
+ negation<std::is_same<void, U>>, //
+ negation<std::is_reference<U>>, //
+ std::is_convertible<const R&, U>, //
+ std::is_copy_constructible<U>>::value>::type>
+ operator Action<U(Args...)>() const { // NOLINT
+ return Impl<U>(value_);
}
private:
- // Implements the Return(x) action for a particular function type F.
- template <typename R_, typename F>
- class Impl : public ActionInterface<F> {
+ // Implements the Return(x) action for a mock function that returns type U.
+ template <typename U>
+ class Impl final {
public:
- typedef typename Function<F>::Result Result;
- typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+ // The constructor used when the return value is allowed to move from the
+ // input value (i.e. we are converting to OnceAction).
+ explicit Impl(R&& input_value)
+ : state_(new State(std::move(input_value))) {}
- // The implicit cast is necessary when Result has more than one
- // single-argument constructor (e.g. Result is std::vector<int>) and R
- // has a type conversion operator template. In that case, value_(value)
- // won't compile as the compiler doesn't known which constructor of
- // Result to call. ImplicitCast_ forces the compiler to convert R to
- // Result without considering explicit constructors, thus resolving the
- // ambiguity. value_ is then initialized using its copy constructor.
- explicit Impl(const std::shared_ptr<R>& value)
- : value_before_cast_(*value),
- value_(ImplicitCast_<Result>(value_before_cast_)) {}
+ // The constructor used when the return value is not allowed to move from
+ // the input value (i.e. we are converting to Action).
+ explicit Impl(const R& input_value) : state_(new State(input_value)) {}
- Result Perform(const ArgumentTuple&) override { return value_; }
+ U operator()() && { return std::move(state_->value); }
+ U operator()() const& { return state_->value; }
private:
- GTEST_COMPILE_ASSERT_(!std::is_reference<Result>::value,
- Result_cannot_be_a_reference_type);
- // We save the value before casting just in case it is being cast to a
- // wrapper type.
- R value_before_cast_;
- Result value_;
-
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl);
+ // We put our state on the heap so that the compiler-generated copy/move
+ // constructors work correctly even when U is a reference-like type. This is
+ // necessary only because we eagerly create State::value (see the note on
+ // that symbol for details). If we instead had only the input value as a
+ // member then the default constructors would work fine.
+ //
+ // For example, when R is std::string and U is std::string_view, value is a
+ // reference to the string backed by input_value. The copy constructor would
+ // copy both, so that we wind up with a new input_value object (with the
+ // same contents) and a reference to the *old* input_value object rather
+ // than the new one.
+ struct State {
+ explicit State(const R& input_value_in)
+ : input_value(input_value_in),
+          // Make an implicit conversion to U before initializing the U
+ // object we store, avoiding calling any explicit constructor of U
+ // from R.
+ //
+ // This simulates the language rules: a function with return type U
+ // that does `return R()` requires R to be implicitly convertible to
+ // U, and uses that path for the conversion, even U Result has an
+          // U, and uses that path for the conversion, even if U has an
+ value(ImplicitCast_<U>(internal::as_const(input_value))) {}
+
+ // As above, but for the case where we're moving from the ReturnAction
+ // object because it's being used as a OnceAction.
+ explicit State(R&& input_value_in)
+ : input_value(std::move(input_value_in)),
+ // For the same reason as above we make an implicit conversion to U
+ // before initializing the value.
+ //
+ // Unlike above we provide the input value as an rvalue to the
+ // implicit conversion because this is a OnceAction: it's fine if it
+ // wants to consume the input value.
+ value(ImplicitCast_<U>(std::move(input_value))) {}
+
+ // A copy of the value originally provided by the user. We retain this in
+ // addition to the value of the mock function's result type below in case
+ // the latter is a reference-like type. See the std::string_view example
+ // in the documentation on Return.
+ R input_value;
+
+ // The value we actually return, as the type returned by the mock function
+ // itself.
+ //
+ // We eagerly initialize this here, rather than lazily doing the implicit
+ // conversion automatically each time Perform is called, for historical
+ // reasons: in 2009-11, commit a070cbd91c (Google changelist 13540126)
+ // made the Action<U()> conversion operator eagerly convert the R value to
+ // U, but without keeping the R alive. This broke the use case discussed
+ // in the documentation for Return, making reference-like types such as
+ // std::string_view not safe to use as U where the input type R is a
+ // value-like type such as std::string.
+ //
+ // The example the commit gave was not very clear, nor was the issue
+ // thread (https://github.com/google/googlemock/issues/86), but it seems
+ // the worry was about reference-like input types R that flatten to a
+ // value-like type U when being implicitly converted. An example of this
+    // is std::vector<bool>::reference, which is often a proxy type with a
+ // reference to the underlying vector:
+ //
+ // // Helper method: have the mock function return bools according
+ // // to the supplied script.
+ // void SetActions(MockFunction<bool(size_t)>& mock,
+ // const std::vector<bool>& script) {
+ // for (size_t i = 0; i < script.size(); ++i) {
+ // EXPECT_CALL(mock, Call(i)).WillOnce(Return(script[i]));
+ // }
+ // }
+ //
+ // TEST(Foo, Bar) {
+ // // Set actions using a temporary vector, whose operator[]
+    //     // returns proxy objects holding references that will be
+ // // dangling once the call to SetActions finishes and the
+ // // vector is destroyed.
+ // MockFunction<bool(size_t)> mock;
+ // SetActions(mock, {false, true});
+ //
+ // EXPECT_FALSE(mock.AsStdFunction()(0));
+ // EXPECT_TRUE(mock.AsStdFunction()(1));
+ // }
+ //
+ // This eager conversion helps with a simple case like this, but doesn't
+ // fully make these types work in general. For example the following still
+ // uses a dangling reference:
+ //
+ // TEST(Foo, Baz) {
+ // MockFunction<std::vector<std::string>()> mock;
+ //
+ // // Return the same vector twice, and then the empty vector
+ // // thereafter.
+ // auto action = Return(std::initializer_list<std::string>{
+ // "taco", "burrito",
+ // });
+ //
+ // EXPECT_CALL(mock, Call)
+ // .WillOnce(action)
+ // .WillOnce(action)
+ // .WillRepeatedly(Return(std::vector<std::string>{}));
+ //
+ // EXPECT_THAT(mock.AsStdFunction()(),
+ // ElementsAre("taco", "burrito"));
+ // EXPECT_THAT(mock.AsStdFunction()(),
+ // ElementsAre("taco", "burrito"));
+ // EXPECT_THAT(mock.AsStdFunction()(), IsEmpty());
+ // }
+ //
+ U value;
+ };
+
+ const std::shared_ptr<State> state_;
};
- // Partially specialize for ByMoveWrapper. This version of ReturnAction will
- // move its contents instead.
- template <typename R_, typename F>
- class Impl<ByMoveWrapper<R_>, F> : public ActionInterface<F> {
- public:
- typedef typename Function<F>::Result Result;
- typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+ R value_;
+};
+
+// A specialization of ReturnAction<R> when R is ByMoveWrapper<T> for some T.
+//
+// This version applies the type system-defeating hack of moving from T even in
+// the const call operator, checking at runtime that it isn't called more than
+// once, since the user has declared their intent to do so by using ByMove.
+template <typename T>
+class ReturnAction<ByMoveWrapper<T>> final {
+ public:
+ explicit ReturnAction(ByMoveWrapper<T> wrapper)
+ : state_(new State(std::move(wrapper.payload))) {}
- explicit Impl(const std::shared_ptr<R>& wrapper)
- : performed_(false), wrapper_(wrapper) {}
+ T operator()() const {
+ GTEST_CHECK_(!state_->called)
+ << "A ByMove() action must be performed at most once.";
- Result Perform(const ArgumentTuple&) override {
- GTEST_CHECK_(!performed_)
- << "A ByMove() action should only be performed once.";
- performed_ = true;
- return std::move(wrapper_->payload);
- }
+ state_->called = true;
+ return std::move(state_->value);
+ }
- private:
- bool performed_;
- const std::shared_ptr<R> wrapper_;
+ private:
+ // We store our state on the heap so that we are copyable as required by
+ // Action, despite the fact that we are stateful and T may not be copyable.
+ struct State {
+ explicit State(T&& value_in) : value(std::move(value_in)) {}
+
+ T value;
+ bool called = false;
};
- const std::shared_ptr<R> value_;
+ const std::shared_ptr<State> state_;
};
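A sketch of the intended single-use semantics (test name and values invented for the example): the payload is moved out on the first performance, and the GTEST_CHECK_ above fires if the action is somehow performed again, which is why ByMove() pairs with WillOnce rather than WillRepeatedly.

#include <memory>
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(ByMoveSketch, HandsOutThePayloadOnce) {
  testing::MockFunction<std::unique_ptr<std::string>()> mock;

  // The wrapped unique_ptr is moved into the return value when the action is
  // performed, so the expectation is limited to a single matching call.
  EXPECT_CALL(mock, Call())
      .WillOnce(testing::Return(
          testing::ByMove(std::make_unique<std::string>("burrito"))));

  EXPECT_EQ("burrito", *mock.Call());
}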
// Implements the ReturnNull() action.
@@ -759,8 +1152,8 @@ class ReturnRefAction {
// Asserts that the function return type is a reference. This
// catches the user error of using ReturnRef(x) when Return(x)
// should be used, and generates some helpful error message.
- GTEST_COMPILE_ASSERT_(std::is_reference<Result>::value,
- use_Return_instead_of_ReturnRef_to_return_a_value);
+ static_assert(std::is_reference<Result>::value,
+ "use Return instead of ReturnRef to return a value");
return Action<F>(new Impl<F>(ref_));
}
@@ -801,9 +1194,8 @@ class ReturnRefOfCopyAction {
// Asserts that the function return type is a reference. This
// catches the user error of using ReturnRefOfCopy(x) when Return(x)
// should be used, and generates some helpful error message.
- GTEST_COMPILE_ASSERT_(
- std::is_reference<Result>::value,
- use_Return_instead_of_ReturnRefOfCopy_to_return_a_value);
+ static_assert(std::is_reference<Result>::value,
+ "use Return instead of ReturnRefOfCopy to return a value");
return Action<F>(new Impl<F>(value_));
}
@@ -839,7 +1231,7 @@ class ReturnRoundRobinAction {
template <typename... Args>
T operator()(Args&&...) const {
- return state_->Next();
+ return state_->Next();
}
private:
@@ -862,7 +1254,9 @@ class DoDefaultAction {
// This template type conversion operator allows DoDefault() to be
// used in any function.
template <typename F>
- operator Action<F>() const { return Action<F>(); } // NOLINT
+ operator Action<F>() const {
+ return Action<F>();
+ } // NOLINT
};
// Implements the Assign action to set a given pointer referent to a
@@ -890,8 +1284,7 @@ template <typename T>
class SetErrnoAndReturnAction {
public:
SetErrnoAndReturnAction(int errno_value, T result)
- : errno_(errno_value),
- result_(result) {}
+ : errno_(errno_value), result_(result) {}
template <typename Result, typename ArgumentTuple>
Result Perform(const ArgumentTuple& /* args */) const {
errno = errno_;
@@ -1002,8 +1395,8 @@ class IgnoreResultAction {
private:
// Type OriginalFunction is the same as F except that its return
// type is IgnoredValue.
- typedef typename internal::Function<F>::MakeResultIgnoredValue
- OriginalFunction;
+ typedef
+ typename internal::Function<F>::MakeResultIgnoredValue OriginalFunction;
const Action<OriginalFunction> action_;
};
@@ -1013,55 +1406,239 @@ class IgnoreResultAction {
template <typename InnerAction, size_t... I>
struct WithArgsAction {
- InnerAction action;
+ InnerAction inner_action;
- // The inner action could be anything convertible to Action<X>.
- // We use the conversion operator to detect the signature of the inner Action.
+  // The signature of the function as seen by the inner action, given an outer
+ // action with the given result and argument types.
template <typename R, typename... Args>
+ using InnerSignature =
+ R(typename std::tuple_element<I, std::tuple<Args...>>::type...);
+
+ // Rather than a call operator, we must define conversion operators to
+ // particular action types. This is necessary for embedded actions like
+  // DoDefault(), which rely on action conversion operators rather than
+ // providing a call operator because even with a particular set of arguments
+ // they don't have a fixed return type.
+
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<
+ InnerAction,
+ // Unfortunately we can't use the InnerSignature alias here;
+ // MSVC complains about the I parameter pack not being
+ // expanded (error C3520) despite it being expanded in the
+ // type alias.
+ OnceAction<R(typename std::tuple_element<
+ I, std::tuple<Args...>>::type...)>>::value,
+ int>::type = 0>
+ operator OnceAction<R(Args...)>() && { // NOLINT
+ struct OA {
+ OnceAction<InnerSignature<R, Args...>> inner_action;
+
+ R operator()(Args&&... args) && {
+ return std::move(inner_action)
+ .Call(std::get<I>(
+ std::forward_as_tuple(std::forward<Args>(args)...))...);
+ }
+ };
+
+ return OA{std::move(inner_action)};
+ }
+
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<
+ const InnerAction&,
+ // Unfortunately we can't use the InnerSignature alias here;
+ // MSVC complains about the I parameter pack not being
+ // expanded (error C3520) despite it being expanded in the
+ // type alias.
+ Action<R(typename std::tuple_element<
+ I, std::tuple<Args...>>::type...)>>::value,
+ int>::type = 0>
operator Action<R(Args...)>() const { // NOLINT
- using TupleType = std::tuple<Args...>;
- Action<R(typename std::tuple_element<I, TupleType>::type...)>
- converted(action);
+ Action<InnerSignature<R, Args...>> converted(inner_action);
- return [converted](Args... args) -> R {
+ return [converted](Args&&... args) -> R {
return converted.Perform(std::forward_as_tuple(
- std::get<I>(std::forward_as_tuple(std::forward<Args>(args)...))...));
+ std::get<I>(std::forward_as_tuple(std::forward<Args>(args)...))...));
};
}
};
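A sketch of the user-facing behavior (names and values invented): WithArg<k> hands only the selected argument to the inner action, whose signature is the InnerSignature alias above.

#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(WithArgSketch, ForwardsSelectedArgument) {
  testing::MockFunction<int(const std::string&, int)> mock;

  // The inner lambda sees only argument #1: it is converted to
  // Action<int(int)> and never receives the string.
  EXPECT_CALL(mock, Call)
      .WillOnce(testing::WithArg<1>([](int n) { return n * 2; }));

  EXPECT_EQ(42, mock.AsStdFunction()("ignored", 21));
}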
template <typename... Actions>
-struct DoAllAction {
- private:
+class DoAllAction;
+
+// Base case: only a single action.
+template <typename FinalAction>
+class DoAllAction<FinalAction> {
+ public:
+ struct UserConstructorTag {};
+
template <typename T>
- using NonFinalType =
- typename std::conditional<std::is_scalar<T>::value, T, const T&>::type;
+ explicit DoAllAction(UserConstructorTag, T&& action)
+ : final_action_(std::forward<T>(action)) {}
+
+ // Rather than a call operator, we must define conversion operators to
+ // particular action types. This is necessary for embedded actions like
+  // DoDefault(), which rely on action conversion operators rather than
+ // providing a call operator because even with a particular set of arguments
+ // they don't have a fixed return type.
+
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<FinalAction, OnceAction<R(Args...)>>::value,
+ int>::type = 0>
+ operator OnceAction<R(Args...)>() && { // NOLINT
+ return std::move(final_action_);
+ }
- template <typename ActionT, size_t... I>
- std::vector<ActionT> Convert(IndexSequence<I...>) const {
- return {ActionT(std::get<I>(actions))...};
+ template <
+ typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<const FinalAction&, Action<R(Args...)>>::value,
+ int>::type = 0>
+ operator Action<R(Args...)>() const { // NOLINT
+ return final_action_;
}
+ private:
+ FinalAction final_action_;
+};
+
+// Recursive case: support N actions by calling the initial action and then
+// calling through to the base class containing N-1 actions.
+template <typename InitialAction, typename... OtherActions>
+class DoAllAction<InitialAction, OtherActions...>
+ : private DoAllAction<OtherActions...> {
+ private:
+ using Base = DoAllAction<OtherActions...>;
+
+ // The type of reference that should be provided to an initial action for a
+ // mocked function parameter of type T.
+ //
+ // There are two quirks here:
+ //
+ // * Unlike most forwarding functions, we pass scalars through by value.
+ // This isn't strictly necessary because an lvalue reference would work
+ // fine too and be consistent with other non-reference types, but it's
+ // perhaps less surprising.
+ //
+ // For example if the mocked function has signature void(int), then it
+ // might seem surprising for the user's initial action to need to be
+ // convertible to Action<void(const int&)>. This is perhaps less
+ // surprising for a non-scalar type where there may be a performance
+ // impact, or it might even be impossible, to pass by value.
+ //
+ // * More surprisingly, `const T&` is often not a const reference type.
+ // By the reference collapsing rules in C++17 [dcl.ref]/6, if T refers to
+ // U& or U&& for some non-scalar type U, then InitialActionArgType<T> is
+ // U&. In other words, we may hand over a non-const reference.
+ //
+ // So for example, given some non-scalar type Obj we have the following
+ // mappings:
+ //
+ // T InitialActionArgType<T>
+ // ------- -----------------------
+ // Obj const Obj&
+ // Obj& Obj&
+ // Obj&& Obj&
+ // const Obj const Obj&
+ // const Obj& const Obj&
+ // const Obj&& const Obj&
+ //
+  //   In other words, the initial actions get a mutable view of a non-scalar
+ // argument if and only if the mock function itself accepts a non-const
+  //   reference type. They are never given an rvalue reference to a
+ // non-scalar type.
+ //
+ // This situation makes sense if you imagine use with a matcher that is
+  //   This situation makes sense if you imagine use with an action that is
+ // to fill in a reference argument and then return a canned value:
+ //
+ // EXPECT_CALL(mock, Call)
+ // .WillOnce(DoAll(SetArgReferee<0>(17), Return(19)));
+ //
+ template <typename T>
+ using InitialActionArgType =
+ typename std::conditional<std::is_scalar<T>::value, T, const T&>::type;
+
public:
- std::tuple<Actions...> actions;
+ struct UserConstructorTag {};
+
+ template <typename T, typename... U>
+ explicit DoAllAction(UserConstructorTag, T&& initial_action,
+ U&&... other_actions)
+ : Base({}, std::forward<U>(other_actions)...),
+ initial_action_(std::forward<T>(initial_action)) {}
+
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ conjunction<
+ // Both the initial action and the rest must support
+ // conversion to OnceAction.
+ std::is_convertible<
+ InitialAction,
+ OnceAction<void(InitialActionArgType<Args>...)>>,
+ std::is_convertible<Base, OnceAction<R(Args...)>>>::value,
+ int>::type = 0>
+ operator OnceAction<R(Args...)>() && { // NOLINT
+ // Return an action that first calls the initial action with arguments
+ // filtered through InitialActionArgType, then forwards arguments directly
+ // to the base class to deal with the remaining actions.
+ struct OA {
+ OnceAction<void(InitialActionArgType<Args>...)> initial_action;
+ OnceAction<R(Args...)> remaining_actions;
+
+ R operator()(Args... args) && {
+ std::move(initial_action)
+ .Call(static_cast<InitialActionArgType<Args>>(args)...);
+
+ return std::move(remaining_actions).Call(std::forward<Args>(args)...);
+ }
+ };
- template <typename R, typename... Args>
+ return OA{
+ std::move(initial_action_),
+ std::move(static_cast<Base&>(*this)),
+ };
+ }
+
+ template <
+ typename R, typename... Args,
+ typename std::enable_if<
+ conjunction<
+ // Both the initial action and the rest must support conversion to
+ // Action.
+ std::is_convertible<const InitialAction&,
+ Action<void(InitialActionArgType<Args>...)>>,
+ std::is_convertible<const Base&, Action<R(Args...)>>>::value,
+ int>::type = 0>
operator Action<R(Args...)>() const { // NOLINT
- struct Op {
- std::vector<Action<void(NonFinalType<Args>...)>> converted;
- Action<R(Args...)> last;
+ // Return an action that first calls the initial action with arguments
+ // filtered through InitialActionArgType, then forwards arguments directly
+ // to the base class to deal with the remaining actions.
+ struct OA {
+ Action<void(InitialActionArgType<Args>...)> initial_action;
+ Action<R(Args...)> remaining_actions;
+
R operator()(Args... args) const {
- auto tuple_args = std::forward_as_tuple(std::forward<Args>(args)...);
- for (auto& a : converted) {
- a.Perform(tuple_args);
- }
- return last.Perform(std::move(tuple_args));
+ initial_action.Perform(std::forward_as_tuple(
+ static_cast<InitialActionArgType<Args>>(args)...));
+
+ return remaining_actions.Perform(
+ std::forward_as_tuple(std::forward<Args>(args)...));
}
};
- return Op{Convert<Action<void(NonFinalType<Args>...)>>(
- MakeIndexSequence<sizeof...(Actions) - 1>()),
- std::get<sizeof...(Actions) - 1>(actions)};
+
+ return OA{
+ initial_action_,
+ static_cast<const Base&>(*this),
+ };
}
+
+ private:
+ InitialAction initial_action_;
};
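A sketch tying the pieces together (test name and values invented; SetArgReferee and Return are standard gmock actions): the initial action receives the argument as InitialActionArgType<int&>, i.e. a mutable int&, while the final action supplies the return value, mirroring the SetArgReferee/Return example in the comment above.

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(DoAllSketch, WritesThroughReferenceThenReturns) {
  testing::MockFunction<int(int&)> mock;

  // SetArgReferee<0>(17) runs first and writes through the reference;
  // Return(19) runs last and provides the call's result.
  EXPECT_CALL(mock, Call)
      .WillOnce(testing::DoAll(testing::SetArgReferee<0>(17),
                               testing::Return(19)));

  int out = 0;
  EXPECT_EQ(19, mock.AsStdFunction()(out));
  EXPECT_EQ(17, out);
}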
template <typename T, typename... Params>
@@ -1078,10 +1655,11 @@ struct ReturnNewAction {
template <size_t k>
struct ReturnArgAction {
- template <typename... Args>
- auto operator()(const Args&... args) const ->
- typename std::tuple_element<k, std::tuple<Args...>>::type {
- return std::get<k>(std::tie(args...));
+ template <typename... Args,
+ typename = typename std::enable_if<(k < sizeof...(Args))>::type>
+ auto operator()(Args&&... args) const -> decltype(std::get<k>(
+ std::forward_as_tuple(std::forward<Args>(args)...))) {
+ return std::get<k>(std::forward_as_tuple(std::forward<Args>(args)...));
}
};
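A sketch of the corresponding user-facing action, ReturnArg<k> (test name and values invented): it echoes the k-th argument back to the caller, and the new enable_if above turns an out-of-range k into a substitution failure instead of a confusing instantiation error.

#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(ReturnArgSketch, EchoesSelectedArgument) {
  testing::MockFunction<std::string(int, std::string)> mock;

  // Argument #1 (the string) is returned to the caller unchanged.
  EXPECT_CALL(mock, Call).WillRepeatedly(testing::ReturnArg<1>());

  EXPECT_EQ("taco", mock.AsStdFunction()(7, "taco"));
}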
@@ -1203,7 +1781,8 @@ typedef internal::IgnoredValue Unused;
template <typename... Action>
internal::DoAllAction<typename std::decay<Action>::type...> DoAll(
Action&&... action) {
- return {std::forward_as_tuple(std::forward<Action>(action)...)};
+ return internal::DoAllAction<typename std::decay<Action>::type...>(
+ {}, std::forward<Action>(action)...);
}
// WithArg<k>(an_action) creates an action that passes the k-th
@@ -1212,8 +1791,8 @@ internal::DoAllAction<typename std::decay<Action>::type...> DoAll(
// multiple arguments. For convenience, we also provide
// WithArgs<k>(an_action) (defined below) as a synonym.
template <size_t k, typename InnerAction>
-internal::WithArgsAction<typename std::decay<InnerAction>::type, k>
-WithArg(InnerAction&& action) {
+internal::WithArgsAction<typename std::decay<InnerAction>::type, k> WithArg(
+ InnerAction&& action) {
return {std::forward<InnerAction>(action)};
}
@@ -1232,14 +1811,35 @@ WithArgs(InnerAction&& action) {
// argument. In other words, it adapts an action accepting no
// argument to one that accepts (and ignores) arguments.
template <typename InnerAction>
-internal::WithArgsAction<typename std::decay<InnerAction>::type>
-WithoutArgs(InnerAction&& action) {
+internal::WithArgsAction<typename std::decay<InnerAction>::type> WithoutArgs(
+ InnerAction&& action) {
return {std::forward<InnerAction>(action)};
}
-// Creates an action that returns 'value'. 'value' is passed by value
-// instead of const reference - otherwise Return("string literal")
-// will trigger a compiler error about using array as initializer.
+// Creates an action that returns a value.
+//
+// The returned type can be used with a mock function returning a non-void,
+// non-reference type U as follows:
+//
+// * If R is convertible to U and U is move-constructible, then the action can
+// be used with WillOnce.
+//
+// * If const R& is convertible to U and U is copy-constructible, then the
+// action can be used with both WillOnce and WillRepeatedly.
+//
+// The mock expectation contains the R value from which the U return value is
+// constructed (a move/copy of the argument to Return). This means that the R
+// value will survive at least until the mock object's expectations are cleared
+// or the mock object is destroyed, meaning that U can safely be a
+// reference-like type such as std::string_view:
+//
+// // The mock function returns a view of a copy of the string fed to
+// // Return. The view is valid even after the action is performed.
+// MockFunction<std::string_view()> mock;
+// EXPECT_CALL(mock, Call).WillOnce(Return(std::string("taco")));
+// const std::string_view result = mock.AsStdFunction()();
+// EXPECT_EQ("taco", result);
+//
template <typename R>
internal::ReturnAction<R> Return(R value) {
return internal::ReturnAction<R>(std::move(value));
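A sketch of the two bullets above in practice (test name and values invented): a copyable payload works with WillRepeatedly, and a merely move-constructible result type now works with WillOnce because the stored value may be moved into the return value.

#include <memory>
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(ReturnSketch, CopyAndMovePaths) {
  // Copy path: const R& -> U, usable any number of times.
  testing::MockFunction<std::string()> repeated;
  EXPECT_CALL(repeated, Call())
      .WillRepeatedly(testing::Return(std::string("taco")));
  EXPECT_EQ("taco", repeated.AsStdFunction()());
  EXPECT_EQ("taco", repeated.AsStdFunction()());

  // Move path: R -> U with a move-only U, usable with WillOnce only.
  testing::MockFunction<std::unique_ptr<int>()> once;
  EXPECT_CALL(once, Call()).WillOnce(testing::Return(std::make_unique<int>(7)));
  EXPECT_EQ(7, *once.Call());
}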
@@ -1273,6 +1873,8 @@ inline internal::ReturnRefOfCopyAction<R> ReturnRefOfCopy(const R& x) {
return internal::ReturnRefOfCopyAction<R>(x);
}
+// DEPRECATED: use Return(x) directly with WillOnce.
+//
// Modifies the parent action (a Return() action) to perform a move of the
// argument instead of a copy.
// Return(ByMove()) actions can only be executed once and will assert this
@@ -1319,7 +1921,7 @@ internal::SetArgumentPointeeAction<N, T> SetArgumentPointee(T value) {
// Creates an action that sets a pointer referent to a given value.
template <typename T1, typename T2>
-PolymorphicAction<internal::AssignAction<T1, T2> > Assign(T1* ptr, T2 val) {
+PolymorphicAction<internal::AssignAction<T1, T2>> Assign(T1* ptr, T2 val) {
return MakePolymorphicAction(internal::AssignAction<T1, T2>(ptr, val));
}
@@ -1327,8 +1929,8 @@ PolymorphicAction<internal::AssignAction<T1, T2> > Assign(T1* ptr, T2 val) {
// Creates an action that sets errno and returns the appropriate error.
template <typename T>
-PolymorphicAction<internal::SetErrnoAndReturnAction<T> >
-SetErrnoAndReturn(int errval, T result) {
+PolymorphicAction<internal::SetErrnoAndReturnAction<T>> SetErrnoAndReturn(
+ int errval, T result) {
return MakePolymorphicAction(
internal::SetErrnoAndReturnAction<T>(errval, result));
}
@@ -1482,7 +2084,8 @@ struct ExcessiveArg {};
// Builds an implementation of an Action<> for some particular signature, using
// a class defined by an ACTION* macro.
-template <typename F, typename Impl> struct ActionImpl;
+template <typename F, typename Impl>
+struct ActionImpl;
template <typename Impl>
struct ImplBase {
@@ -1502,7 +2105,7 @@ struct ActionImpl<R(Args...), Impl> : ImplBase<Impl>::type {
using args_type = std::tuple<Args...>;
ActionImpl() = default; // Only defined if appropriate for Base.
- explicit ActionImpl(std::shared_ptr<Impl> impl) : Base{std::move(impl)} { }
+ explicit ActionImpl(std::shared_ptr<Impl> impl) : Base{std::move(impl)} {}
R operator()(Args&&... arg) const {
static constexpr size_t kMaxArgs =
@@ -1521,12 +2124,14 @@ struct ActionImpl<R(Args...), Impl> : ImplBase<Impl>::type {
// args_type get passed, followed by a dummy of unspecified type for the
// remainder up to 10 explicit args.
static constexpr ExcessiveArg kExcessArg{};
- return static_cast<const Impl&>(*this).template gmock_PerformImpl<
- /*function_type=*/function_type, /*return_type=*/R,
- /*args_type=*/args_type,
- /*argN_type=*/typename std::tuple_element<arg_id, args_type>::type...>(
- /*args=*/args, std::get<arg_id>(args)...,
- ((void)excess_id, kExcessArg)...);
+ return static_cast<const Impl&>(*this)
+ .template gmock_PerformImpl<
+ /*function_type=*/function_type, /*return_type=*/R,
+ /*args_type=*/args_type,
+ /*argN_type=*/
+ typename std::tuple_element<arg_id, args_type>::type...>(
+ /*args=*/args, std::get<arg_id>(args)...,
+ ((void)excess_id, kExcessArg)...);
}
};
@@ -1545,7 +2150,7 @@ template <typename F, typename Impl>
#define GMOCK_INTERNAL_ARG_UNUSED(i, data, el) \
, const arg##i##_type& arg##i GTEST_ATTRIBUTE_UNUSED_
-#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_ \
+#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_ \
const args_type& args GTEST_ATTRIBUTE_UNUSED_ GMOCK_PP_REPEAT( \
GMOCK_INTERNAL_ARG_UNUSED, , 10)
@@ -1584,42 +2189,47 @@ template <typename F, typename Impl>
#define GMOCK_ACTION_FIELD_PARAMS_(params) \
GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_FIELD_PARAM, , params)
-#define GMOCK_INTERNAL_ACTION(name, full_name, params) \
- template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
- class full_name { \
- public: \
- explicit full_name(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \
- : impl_(std::make_shared<gmock_Impl>( \
- GMOCK_ACTION_GVALUE_PARAMS_(params))) { } \
- full_name(const full_name&) = default; \
- full_name(full_name&&) noexcept = default; \
- template <typename F> \
- operator ::testing::Action<F>() const { \
- return ::testing::internal::MakeAction<F>(impl_); \
- } \
- private: \
- class gmock_Impl { \
- public: \
- explicit gmock_Impl(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \
- : GMOCK_ACTION_INIT_PARAMS_(params) {} \
- template <typename function_type, typename return_type, \
- typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
- return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
- GMOCK_ACTION_FIELD_PARAMS_(params) \
- }; \
- std::shared_ptr<const gmock_Impl> impl_; \
- }; \
- template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
- inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name( \
- GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) { \
- return full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>( \
- GMOCK_ACTION_GVALUE_PARAMS_(params)); \
- } \
- template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
- template <typename function_type, typename return_type, typename args_type, \
- GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
- return_type full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>::gmock_Impl:: \
- gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
+#define GMOCK_INTERNAL_ACTION(name, full_name, params) \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ class full_name { \
+ public: \
+ explicit full_name(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \
+ : impl_(std::make_shared<gmock_Impl>( \
+ GMOCK_ACTION_GVALUE_PARAMS_(params))) {} \
+ full_name(const full_name&) = default; \
+ full_name(full_name&&) noexcept = default; \
+ template <typename F> \
+ operator ::testing::Action<F>() const { \
+ return ::testing::internal::MakeAction<F>(impl_); \
+ } \
+ \
+ private: \
+ class gmock_Impl { \
+ public: \
+ explicit gmock_Impl(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \
+ : GMOCK_ACTION_INIT_PARAMS_(params) {} \
+ template <typename function_type, typename return_type, \
+ typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
+ GMOCK_ACTION_FIELD_PARAMS_(params) \
+ }; \
+ std::shared_ptr<const gmock_Impl> impl_; \
+ }; \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name( \
+ GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) GTEST_MUST_USE_RESULT_; \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name( \
+ GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) { \
+ return full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>( \
+ GMOCK_ACTION_GVALUE_PARAMS_(params)); \
+ } \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ template <typename function_type, typename return_type, typename args_type, \
+ GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type \
+ full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>::gmock_Impl::gmock_PerformImpl( \
+ GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
} // namespace internal
@@ -1627,12 +2237,13 @@ template <typename F, typename Impl>
#define ACTION(name) \
class name##Action { \
public: \
- explicit name##Action() noexcept {} \
- name##Action(const name##Action&) noexcept {} \
+ explicit name##Action() noexcept {} \
+ name##Action(const name##Action&) noexcept {} \
template <typename F> \
operator ::testing::Action<F>() const { \
return ::testing::internal::MakeAction<F, gmock_Impl>(); \
} \
+ \
private: \
class gmock_Impl { \
public: \
@@ -1681,7 +2292,7 @@ template <typename F, typename Impl>
} // namespace testing
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h b/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h
index fc7f803a7..b6ab648e5 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h
@@ -27,21 +27,23 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file implements some commonly used cardinalities. More
// cardinalities can be defined by the user implementing the
// CardinalityInterface interface if necessary.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
#include <limits.h>
+
#include <memory>
#include <ostream> // NOLINT
+
#include "gmock/internal/gmock-port.h"
#include "gtest/gtest.h"
@@ -116,7 +118,7 @@ class GTEST_API_ Cardinality {
// cardinality, i.e. exceed the maximum number of allowed calls.
bool IsOverSaturatedByCallCount(int call_count) const {
return impl_->IsSaturatedByCallCount(call_count) &&
- !impl_->IsSatisfiedByCallCount(call_count);
+ !impl_->IsSatisfiedByCallCount(call_count);
}
// Describes self to an ostream
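A sketch of the distinction being computed here, assuming the stock Between() cardinality (test name invented for the example): a call count is over-saturating once it exceeds the maximum, i.e. it saturates the cardinality without satisfying it.

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(CardinalitySketch, OverSaturation) {
  const testing::Cardinality c = testing::Between(2, 3);

  EXPECT_FALSE(c.IsSatisfiedByCallCount(1));      // too few calls so far
  EXPECT_FALSE(c.IsOverSaturatedByCallCount(3));  // saturated but satisfied
  EXPECT_TRUE(c.IsOverSaturatedByCallCount(4));   // beyond the maximum
}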
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h b/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h
index 0fc6f6f3f..f565d980c 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h
@@ -31,7 +31,8 @@
//
// This file implements MOCK_METHOD.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_ // NOLINT
#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_ // NOLINT
@@ -64,6 +65,39 @@ struct ThisRefAdjuster {
}
};
+constexpr bool PrefixOf(const char* a, const char* b) {
+ return *a == 0 || (*a == *b && internal::PrefixOf(a + 1, b + 1));
+}
+
+template <int N, int M>
+constexpr bool StartsWith(const char (&prefix)[N], const char (&str)[M]) {
+ return N <= M && internal::PrefixOf(prefix, str);
+}
+
+template <int N, int M>
+constexpr bool EndsWith(const char (&suffix)[N], const char (&str)[M]) {
+ return N <= M && internal::PrefixOf(suffix, str + M - N);
+}
+
+template <int N, int M>
+constexpr bool Equals(const char (&a)[N], const char (&b)[M]) {
+ return N == M && internal::PrefixOf(a, b);
+}
+
+template <int N>
+constexpr bool ValidateSpec(const char (&spec)[N]) {
+ return internal::Equals("const", spec) ||
+ internal::Equals("override", spec) ||
+ internal::Equals("final", spec) ||
+ internal::Equals("noexcept", spec) ||
+ (internal::StartsWith("noexcept(", spec) &&
+ internal::EndsWith(")", spec)) ||
+ internal::Equals("ref(&)", spec) ||
+ internal::Equals("ref(&&)", spec) ||
+ (internal::StartsWith("Calltype(", spec) &&
+ internal::EndsWith(")", spec));
+}
+
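+// Editorial note, not part of the upstream change: a minimal sketch of the
+// specification tokens ValidateSpec() above accepts, assuming the usual
+// MOCK_METHOD syntax; `MockTurtle` and `Turtle` are hypothetical names.
+//
+//   class MockTurtle : public Turtle {
+//    public:
+//     MOCK_METHOD(void, PenDown, (), (override));
+//     MOCK_METHOD(int, GetX, (), (const, override, noexcept));
+//     MOCK_METHOD(void, Draw, (), (const, Calltype(STDMETHODCALLTYPE)));
+//   };
+//
+// Each token must be const, override, final, noexcept, noexcept(...), ref(&),
+// ref(&&) or Calltype(...); with GMOCK_INTERNAL_STRICT_SPEC_ASSERT defined,
+// an unrecognized token (often a missing comma) fails the static_assert
+// defined further below in this header.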
} // namespace internal
// The style guide prohibits "using" statements in a namespace scope
@@ -86,17 +120,18 @@ using internal::FunctionMocker;
#define GMOCK_INTERNAL_MOCK_METHOD_ARG_3(_Ret, _MethodName, _Args) \
GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, ())
-#define GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, _Spec) \
- GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Args); \
- GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Spec); \
- GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE( \
- GMOCK_PP_NARG0 _Args, GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)); \
- GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec) \
- GMOCK_INTERNAL_MOCK_METHOD_IMPL( \
- GMOCK_PP_NARG0 _Args, _MethodName, GMOCK_INTERNAL_HAS_CONST(_Spec), \
- GMOCK_INTERNAL_HAS_OVERRIDE(_Spec), GMOCK_INTERNAL_HAS_FINAL(_Spec), \
- GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Spec), \
- GMOCK_INTERNAL_GET_CALLTYPE(_Spec), GMOCK_INTERNAL_GET_REF_SPEC(_Spec), \
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, _Spec) \
+ GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Args); \
+ GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Spec); \
+ GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE( \
+ GMOCK_PP_NARG0 _Args, GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)); \
+ GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec) \
+ GMOCK_INTERNAL_MOCK_METHOD_IMPL( \
+ GMOCK_PP_NARG0 _Args, _MethodName, GMOCK_INTERNAL_HAS_CONST(_Spec), \
+ GMOCK_INTERNAL_HAS_OVERRIDE(_Spec), GMOCK_INTERNAL_HAS_FINAL(_Spec), \
+ GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Spec), \
+ GMOCK_INTERNAL_GET_CALLTYPE_SPEC(_Spec), \
+ GMOCK_INTERNAL_GET_REF_SPEC(_Spec), \
(GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)))
#define GMOCK_INTERNAL_MOCK_METHOD_ARG_5(...) \
@@ -166,11 +201,11 @@ using internal::FunctionMocker;
GMOCK_INTERNAL_A_MATCHER_ARGUMENT, _Signature, _N)); \
} \
mutable ::testing::FunctionMocker<GMOCK_PP_REMOVE_PARENS(_Signature)> \
- GMOCK_MOCKER_(_N, _Constness, _MethodName)
+ GMOCK_MOCKER_(_N, _Constness, _MethodName)
#define GMOCK_INTERNAL_EXPAND(...) __VA_ARGS__
-// Five Valid modifiers.
+// Valid modifiers.
#define GMOCK_INTERNAL_HAS_CONST(_Tuple) \
GMOCK_PP_HAS_COMMA(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_CONST, ~, _Tuple))
@@ -189,6 +224,14 @@ using internal::FunctionMocker;
GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)), \
_elem, )
+#define GMOCK_INTERNAL_GET_CALLTYPE_SPEC(_Tuple) \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_CALLTYPE_SPEC_IF_CALLTYPE, ~, _Tuple)
+
+#define GMOCK_INTERNAL_CALLTYPE_SPEC_IF_CALLTYPE(_i, _, _elem) \
+ GMOCK_PP_IF( \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem)), \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_UNPACK_, _elem), )
+
#define GMOCK_INTERNAL_GET_REF_SPEC(_Tuple) \
GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_REF_SPEC_IF_REF, ~, _Tuple)
@@ -196,19 +239,25 @@ using internal::FunctionMocker;
GMOCK_PP_IF(GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)), \
GMOCK_PP_CAT(GMOCK_INTERNAL_UNPACK_, _elem), )
-#define GMOCK_INTERNAL_GET_CALLTYPE(_Tuple) \
- GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GET_CALLTYPE_IMPL, ~, _Tuple)
-
-#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem) \
- static_assert( \
- (GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem)) + \
- GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem)) + \
- GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem)) + \
- GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)) + \
- GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)) + \
- GMOCK_INTERNAL_IS_CALLTYPE(_elem)) == 1, \
- GMOCK_PP_STRINGIZE( \
+#ifdef GMOCK_INTERNAL_STRICT_SPEC_ASSERT
+#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem) \
+ static_assert( \
+ ::testing::internal::ValidateSpec(GMOCK_PP_STRINGIZE(_elem)), \
+ "Token \'" GMOCK_PP_STRINGIZE( \
+ _elem) "\' cannot be recognized as a valid specification " \
+ "modifier. Is a ',' missing?");
+#else
+#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem) \
+ static_assert( \
+ (GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem))) == 1, \
+ GMOCK_PP_STRINGIZE( \
_elem) " cannot be recognized as a valid specification modifier.");
+#endif // GMOCK_INTERNAL_STRICT_SPEC_ASSERT
// Modifiers implementation.
#define GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem) \
@@ -238,26 +287,12 @@ using internal::FunctionMocker;
#define GMOCK_INTERNAL_UNPACK_ref(x) x
-#define GMOCK_INTERNAL_GET_CALLTYPE_IMPL(_i, _, _elem) \
- GMOCK_PP_IF(GMOCK_INTERNAL_IS_CALLTYPE(_elem), \
- GMOCK_INTERNAL_GET_VALUE_CALLTYPE, GMOCK_PP_EMPTY) \
- (_elem)
+#define GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem) \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_CALLTYPE_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_CALLTYPE_I_Calltype ,
-// TODO(iserna): GMOCK_INTERNAL_IS_CALLTYPE and
-// GMOCK_INTERNAL_GET_VALUE_CALLTYPE needed more expansions to work on windows
-// maybe they can be simplified somehow.
-#define GMOCK_INTERNAL_IS_CALLTYPE(_arg) \
- GMOCK_INTERNAL_IS_CALLTYPE_I( \
- GMOCK_PP_CAT(GMOCK_INTERNAL_IS_CALLTYPE_HELPER_, _arg))
-#define GMOCK_INTERNAL_IS_CALLTYPE_I(_arg) GMOCK_PP_IS_ENCLOSED_PARENS(_arg)
-
-#define GMOCK_INTERNAL_GET_VALUE_CALLTYPE(_arg) \
- GMOCK_INTERNAL_GET_VALUE_CALLTYPE_I( \
- GMOCK_PP_CAT(GMOCK_INTERNAL_IS_CALLTYPE_HELPER_, _arg))
-#define GMOCK_INTERNAL_GET_VALUE_CALLTYPE_I(_arg) \
- GMOCK_PP_IDENTITY _arg
-
-#define GMOCK_INTERNAL_IS_CALLTYPE_HELPER_Calltype
+#define GMOCK_INTERNAL_UNPACK_Calltype(...) __VA_ARGS__
// Note: The use of `identity_t` here allows _Ret to represent return types that
// would normally need to be specified in a different way. For example, a method
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h b/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h
index 86be9c176..628290114 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// The MATCHER* family of macros can be used in a namespace scope to
@@ -250,7 +249,8 @@
// See googletest/include/gtest/gtest-matchers.h for the definition of class
// Matcher, class MatcherInterface, and others.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
@@ -313,7 +313,9 @@ class StringMatchResultListener : public MatchResultListener {
private:
::std::stringstream ss_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StringMatchResultListener);
+ StringMatchResultListener(const StringMatchResultListener&) = delete;
+ StringMatchResultListener& operator=(const StringMatchResultListener&) =
+ delete;
};
// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION
@@ -396,7 +398,7 @@ class MatcherCastImpl {
// is already a Matcher. This only compiles when type T can be
// statically converted to type U.
template <typename T, typename U>
-class MatcherCastImpl<T, Matcher<U> > {
+class MatcherCastImpl<T, Matcher<U>> {
public:
static Matcher<T> Cast(const Matcher<U>& source_matcher) {
return Matcher<T>(new Impl(source_matcher));
@@ -450,7 +452,7 @@ class MatcherCastImpl<T, Matcher<U> > {
// This even more specialized version is used for efficiently casting
// a matcher to its own type.
template <typename T>
-class MatcherCastImpl<T, Matcher<T> > {
+class MatcherCastImpl<T, Matcher<T>> {
public:
static Matcher<T> Cast(const Matcher<T>& matcher) { return matcher; }
};
@@ -533,19 +535,18 @@ inline Matcher<T> SafeMatcherCast(const Matcher<U>& matcher) {
"T must be implicitly convertible to U");
// Enforce that we are not converting a non-reference type T to a reference
// type U.
- GTEST_COMPILE_ASSERT_(
- std::is_reference<T>::value || !std::is_reference<U>::value,
- cannot_convert_non_reference_arg_to_reference);
+ static_assert(std::is_reference<T>::value || !std::is_reference<U>::value,
+ "cannot convert non reference arg to reference");
// In case both T and U are arithmetic types, enforce that the
// conversion is not lossy.
typedef GTEST_REMOVE_REFERENCE_AND_CONST_(T) RawT;
typedef GTEST_REMOVE_REFERENCE_AND_CONST_(U) RawU;
constexpr bool kTIsOther = GMOCK_KIND_OF_(RawT) == internal::kOther;
constexpr bool kUIsOther = GMOCK_KIND_OF_(RawU) == internal::kOther;
- GTEST_COMPILE_ASSERT_(
+ static_assert(
kTIsOther || kUIsOther ||
- (internal::LosslessArithmeticConvertible<RawT, RawU>::value),
- conversion_of_arithmetic_types_must_be_lossless);
+ (internal::LosslessArithmeticConvertible<RawT, RawU>::value),
+ "conversion of arithmetic types must be lossless");
return MatcherCast<T>(matcher);
}
@@ -678,9 +679,9 @@ bool TupleMatches(const MatcherTuple& matcher_tuple,
const ValueTuple& value_tuple) {
// Makes sure that matcher_tuple and value_tuple have the same
// number of fields.
- GTEST_COMPILE_ASSERT_(std::tuple_size<MatcherTuple>::value ==
- std::tuple_size<ValueTuple>::value,
- matcher_and_value_have_different_numbers_of_fields);
+ static_assert(std::tuple_size<MatcherTuple>::value ==
+ std::tuple_size<ValueTuple>::value,
+ "matcher and value have different numbers of fields");
return TuplePrefix<std::tuple_size<ValueTuple>::value>::Matches(matcher_tuple,
value_tuple);
}
@@ -689,8 +690,7 @@ bool TupleMatches(const MatcherTuple& matcher_tuple,
// is no failure, nothing will be streamed to os.
template <typename MatcherTuple, typename ValueTuple>
void ExplainMatchFailureTupleTo(const MatcherTuple& matchers,
- const ValueTuple& values,
- ::std::ostream* os) {
+ const ValueTuple& values, ::std::ostream* os) {
TuplePrefix<std::tuple_size<MatcherTuple>::value>::ExplainMatchFailuresTo(
matchers, values, os);
}
@@ -714,14 +714,14 @@ class TransformTupleValuesHelper {
private:
template <typename Tup, size_t kRemainingSize>
struct IterateOverTuple {
- OutIter operator() (Func f, const Tup& t, OutIter out) const {
+ OutIter operator()(Func f, const Tup& t, OutIter out) const {
*out++ = f(::std::get<TupleSize::value - kRemainingSize>(t));
return IterateOverTuple<Tup, kRemainingSize - 1>()(f, t, out);
}
};
template <typename Tup>
struct IterateOverTuple<Tup, 0> {
- OutIter operator() (Func /* f */, const Tup& /* t */, OutIter out) const {
+ OutIter operator()(Func /* f */, const Tup& /* t */, OutIter out) const {
return out;
}
};
@@ -767,9 +767,7 @@ class IsNullMatcher {
}
void DescribeTo(::std::ostream* os) const { *os << "is NULL"; }
- void DescribeNegationTo(::std::ostream* os) const {
- *os << "isn't NULL";
- }
+ void DescribeNegationTo(::std::ostream* os) const { *os << "isn't NULL"; }
};
// Implements the polymorphic NotNull() matcher, which matches any raw or smart
@@ -783,9 +781,7 @@ class NotNullMatcher {
}
void DescribeTo(::std::ostream* os) const { *os << "isn't NULL"; }
- void DescribeNegationTo(::std::ostream* os) const {
- *os << "is NULL";
- }
+ void DescribeNegationTo(::std::ostream* os) const { *os << "is NULL"; }
};
// Ref(variable) matches any argument that is a reference to
@@ -871,8 +867,7 @@ inline bool CaseInsensitiveCStringEquals(const wchar_t* lhs,
// String comparison for narrow or wide strings that can have embedded NUL
// characters.
template <typename StringType>
-bool CaseInsensitiveStringEquals(const StringType& s1,
- const StringType& s2) {
+bool CaseInsensitiveStringEquals(const StringType& s1, const StringType& s2) {
// Are the heads equal?
if (!CaseInsensitiveCStringEquals(s1.c_str(), s2.c_str())) {
return false;
@@ -933,8 +928,8 @@ class StrEqualityMatcher {
bool MatchAndExplain(const MatcheeStringType& s,
MatchResultListener* /* listener */) const {
const StringType s2(s);
- const bool eq = case_sensitive_ ? s2 == string_ :
- CaseInsensitiveStringEquals(s2, string_);
+ const bool eq = case_sensitive_ ? s2 == string_
+ : CaseInsensitiveStringEquals(s2, string_);
return expect_eq_ == eq;
}
@@ -1021,8 +1016,7 @@ class HasSubstrMatcher {
template <typename StringType>
class StartsWithMatcher {
public:
- explicit StartsWithMatcher(const StringType& prefix) : prefix_(prefix) {
- }
+ explicit StartsWithMatcher(const StringType& prefix) : prefix_(prefix) {}
#if GTEST_INTERNAL_HAS_STRING_VIEW
bool MatchAndExplain(const internal::StringView& s,
@@ -1053,7 +1047,7 @@ class StartsWithMatcher {
MatchResultListener* /* listener */) const {
const StringType& s2(s);
return s2.length() >= prefix_.length() &&
- s2.substr(0, prefix_.length()) == prefix_;
+ s2.substr(0, prefix_.length()) == prefix_;
}
void DescribeTo(::std::ostream* os) const {
@@ -1107,7 +1101,7 @@ class EndsWithMatcher {
MatchResultListener* /* listener */) const {
const StringType& s2(s);
return s2.length() >= suffix_.length() &&
- s2.substr(s2.length() - suffix_.length()) == suffix_;
+ s2.substr(s2.length() - suffix_.length()) == suffix_;
}
void DescribeTo(::std::ostream* os) const {
@@ -1124,6 +1118,45 @@ class EndsWithMatcher {
const StringType suffix_;
};
+// Implements the polymorphic WhenBase64Unescaped(matcher) matcher, which can be
+// used as a Matcher<T> as long as T can be converted to a string.
+class WhenBase64UnescapedMatcher {
+ public:
+ using is_gtest_matcher = void;
+
+ explicit WhenBase64UnescapedMatcher(
+ const Matcher<const std::string&>& internal_matcher)
+ : internal_matcher_(internal_matcher) {}
+
+ // Matches anything that can convert to std::string.
+ template <typename MatcheeStringType>
+ bool MatchAndExplain(const MatcheeStringType& s,
+ MatchResultListener* listener) const {
+ const std::string s2(s); // NOLINT (needed for working with string_view).
+ std::string unescaped;
+ if (!internal::Base64Unescape(s2, &unescaped)) {
+ if (listener != nullptr) {
+ *listener << "is not a valid base64 escaped string";
+ }
+ return false;
+ }
+ return MatchPrintAndExplain(unescaped, internal_matcher_, listener);
+ }
+
+ void DescribeTo(::std::ostream* os) const {
+ *os << "matches after Base64Unescape ";
+ internal_matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const {
+ *os << "does not match after Base64Unescape ";
+ internal_matcher_.DescribeTo(os);
+ }
+
+ private:
+ const Matcher<const std::string&> internal_matcher_;
+};
+
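+// Editorial note, not part of the upstream change: a minimal usage sketch of
+// the matcher implemented above, assuming the public factory is spelled
+// WhenBase64Unescaped() as the class comment indicates:
+//
+//   EXPECT_THAT("SGVsbG8gd29ybGQ=",
+//               WhenBase64Unescaped(StartsWith("Hello")));
+//
+// The argument is Base64-decoded first; the inner matcher then runs on the
+// decoded std::string, and input that is not valid base64 fails the match.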
// Implements a matcher that compares the two fields of a 2-tuple
// using one of the ==, <=, <, etc, operators. The two fields being
// compared don't have to have the same type.
@@ -1197,8 +1230,7 @@ class Ge2Matcher : public PairMatchBase<Ge2Matcher, AnyGe> {
template <typename T>
class NotMatcherImpl : public MatcherInterface<const T&> {
public:
- explicit NotMatcherImpl(const Matcher<T>& matcher)
- : matcher_(matcher) {}
+ explicit NotMatcherImpl(const Matcher<T>& matcher) : matcher_(matcher) {}
bool MatchAndExplain(const T& x,
MatchResultListener* listener) const override {
@@ -1242,7 +1274,7 @@ class NotMatcher {
template <typename T>
class AllOfMatcherImpl : public MatcherInterface<const T&> {
public:
- explicit AllOfMatcherImpl(std::vector<Matcher<T> > matchers)
+ explicit AllOfMatcherImpl(std::vector<Matcher<T>> matchers)
: matchers_(std::move(matchers)) {}
void DescribeTo(::std::ostream* os) const override {
@@ -1293,7 +1325,7 @@ class AllOfMatcherImpl : public MatcherInterface<const T&> {
}
private:
- const std::vector<Matcher<T> > matchers_;
+ const std::vector<Matcher<T>> matchers_;
};
// VariadicMatcher is used for the variadic implementation of
@@ -1316,14 +1348,14 @@ class VariadicMatcher {
// all of the provided matchers (Matcher1, Matcher2, ...) can match.
template <typename T>
operator Matcher<T>() const {
- std::vector<Matcher<T> > values;
+ std::vector<Matcher<T>> values;
CreateVariadicMatcher<T>(&values, std::integral_constant<size_t, 0>());
return Matcher<T>(new CombiningMatcher<T>(std::move(values)));
}
private:
template <typename T, size_t I>
- void CreateVariadicMatcher(std::vector<Matcher<T> >* values,
+ void CreateVariadicMatcher(std::vector<Matcher<T>>* values,
std::integral_constant<size_t, I>) const {
values->push_back(SafeMatcherCast<T>(std::get<I>(matchers_)));
CreateVariadicMatcher<T>(values, std::integral_constant<size_t, I + 1>());
@@ -1331,7 +1363,7 @@ class VariadicMatcher {
template <typename T>
void CreateVariadicMatcher(
- std::vector<Matcher<T> >*,
+ std::vector<Matcher<T>>*,
std::integral_constant<size_t, sizeof...(Args)>) const {}
std::tuple<Args...> matchers_;
@@ -1347,7 +1379,7 @@ using AllOfMatcher = VariadicMatcher<AllOfMatcherImpl, Args...>;
template <typename T>
class AnyOfMatcherImpl : public MatcherInterface<const T&> {
public:
- explicit AnyOfMatcherImpl(std::vector<Matcher<T> > matchers)
+ explicit AnyOfMatcherImpl(std::vector<Matcher<T>> matchers)
: matchers_(std::move(matchers)) {}
void DescribeTo(::std::ostream* os) const override {
@@ -1398,13 +1430,35 @@ class AnyOfMatcherImpl : public MatcherInterface<const T&> {
}
private:
- const std::vector<Matcher<T> > matchers_;
+ const std::vector<Matcher<T>> matchers_;
};
// AnyOfMatcher is used for the variadic implementation of AnyOf(m_1, m_2, ...).
template <typename... Args>
using AnyOfMatcher = VariadicMatcher<AnyOfMatcherImpl, Args...>;
+// ConditionalMatcher is the implementation of Conditional(cond, m1, m2)
+template <typename MatcherTrue, typename MatcherFalse>
+class ConditionalMatcher {
+ public:
+ ConditionalMatcher(bool condition, MatcherTrue matcher_true,
+ MatcherFalse matcher_false)
+ : condition_(condition),
+ matcher_true_(std::move(matcher_true)),
+ matcher_false_(std::move(matcher_false)) {}
+
+ template <typename T>
+ operator Matcher<T>() const { // NOLINT(runtime/explicit)
+ return condition_ ? SafeMatcherCast<T>(matcher_true_)
+ : SafeMatcherCast<T>(matcher_false_);
+ }
+
+ private:
+ bool condition_;
+ MatcherTrue matcher_true_;
+ MatcherFalse matcher_false_;
+};
+
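+// Editorial note, not part of the upstream change: a minimal usage sketch of
+// Conditional(), which the class above implements; `strict` is a hypothetical
+// runtime flag:
+//
+//   EXPECT_THAT(value, Conditional(strict, Gt(10), Gt(0)));
+//
+// Only one branch is used per match: matcher_true_ when the condition holds,
+// matcher_false_ otherwise, each converted via SafeMatcherCast<T>.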
// Wrapper for implementation of Any/AllOfArray().
template <template <class> class MatcherImpl, typename T>
class SomeOfArrayMatcher {
@@ -1454,8 +1508,7 @@ class TrulyMatcher {
// We cannot write 'return !!predicate_(x);' as that doesn't work
// when predicate_(x) returns a class convertible to bool but
// having no operator!().
- if (predicate_(x))
- return true;
+ if (predicate_(x)) return true;
*listener << "didn't satisfy the given predicate";
return false;
}
@@ -1563,8 +1616,8 @@ class PredicateFormatterFromMatcher {
// used for implementing ASSERT_THAT() and EXPECT_THAT().
// Implementation detail: 'matcher' is received by-value to force decaying.
template <typename M>
-inline PredicateFormatterFromMatcher<M>
-MakePredicateFormatterFromMatcher(M matcher) {
+inline PredicateFormatterFromMatcher<M> MakePredicateFormatterFromMatcher(
+ M matcher) {
return PredicateFormatterFromMatcher<M>(std::move(matcher));
}
@@ -1579,9 +1632,7 @@ class IsNanMatcher {
}
void DescribeTo(::std::ostream* os) const { *os << "is NaN"; }
- void DescribeNegationTo(::std::ostream* os) const {
- *os << "isn't NaN";
- }
+ void DescribeNegationTo(::std::ostream* os) const { *os << "isn't NaN"; }
};
// Implements the polymorphic floating point equality matcher, which matches
@@ -1597,9 +1648,8 @@ class FloatingEqMatcher {
// equality comparisons between NANs will always return false. We specify a
// negative max_abs_error_ term to indicate that ULP-based approximation will
// be used for comparison.
- FloatingEqMatcher(FloatType expected, bool nan_eq_nan) :
- expected_(expected), nan_eq_nan_(nan_eq_nan), max_abs_error_(-1) {
- }
+ FloatingEqMatcher(FloatType expected, bool nan_eq_nan)
+ : expected_(expected), nan_eq_nan_(nan_eq_nan), max_abs_error_(-1) {}
// Constructor that supports a user-specified max_abs_error that will be used
// for comparison instead of ULP-based approximation. The max absolute
@@ -1661,8 +1711,8 @@ class FloatingEqMatcher {
// os->precision() returns the previously set precision, which we
// store to restore the ostream to its original configuration
// after outputting.
- const ::std::streamsize old_precision = os->precision(
- ::std::numeric_limits<FloatType>::digits10 + 2);
+ const ::std::streamsize old_precision =
+ os->precision(::std::numeric_limits<FloatType>::digits10 + 2);
if (FloatingPoint<FloatType>(expected_).is_nan()) {
if (nan_eq_nan_) {
*os << "is NaN";
@@ -1680,8 +1730,8 @@ class FloatingEqMatcher {
void DescribeNegationTo(::std::ostream* os) const override {
// As before, get original precision.
- const ::std::streamsize old_precision = os->precision(
- ::std::numeric_limits<FloatType>::digits10 + 2);
+ const ::std::streamsize old_precision =
+ os->precision(::std::numeric_limits<FloatType>::digits10 + 2);
if (FloatingPoint<FloatType>(expected_).is_nan()) {
if (nan_eq_nan_) {
*os << "isn't NaN";
@@ -1699,9 +1749,7 @@ class FloatingEqMatcher {
}
private:
- bool HasMaxAbsError() const {
- return max_abs_error_ >= 0;
- }
+ bool HasMaxAbsError() const { return max_abs_error_ >= 0; }
const FloatType expected_;
const bool nan_eq_nan_;
@@ -1773,9 +1821,8 @@ class FloatingEq2Matcher {
template <typename Tuple>
class Impl : public MatcherInterface<Tuple> {
public:
- Impl(FloatType max_abs_error, bool nan_eq_nan) :
- max_abs_error_(max_abs_error),
- nan_eq_nan_(nan_eq_nan) {}
+ Impl(FloatType max_abs_error, bool nan_eq_nan)
+ : max_abs_error_(max_abs_error), nan_eq_nan_(nan_eq_nan) {}
bool MatchAndExplain(Tuple args,
MatchResultListener* listener) const override {
@@ -1951,9 +1998,7 @@ class WhenDynamicCastToMatcherBase {
protected:
const Matcher<To> matcher_;
- static std::string GetToName() {
- return GetTypeName<To>();
- }
+ static std::string GetToName() { return GetTypeName<To>(); }
private:
static void GetCastTypeDescription(::std::ostream* os) {
@@ -2090,7 +2135,7 @@ class PropertyMatcher {
}
template <typename T>
- bool MatchAndExplain(const T&value, MatchResultListener* listener) const {
+ bool MatchAndExplain(const T& value, MatchResultListener* listener) const {
return MatchAndExplainImpl(
typename std::is_pointer<typename std::remove_const<T>::type>::type(),
value, listener);
@@ -2142,16 +2187,16 @@ struct CallableTraits {
// Specialization for function pointers.
template <typename ArgType, typename ResType>
-struct CallableTraits<ResType(*)(ArgType)> {
+struct CallableTraits<ResType (*)(ArgType)> {
typedef ResType ResultType;
- typedef ResType(*StorageType)(ArgType);
+ typedef ResType (*StorageType)(ArgType);
- static void CheckIsValid(ResType(*f)(ArgType)) {
+ static void CheckIsValid(ResType (*f)(ArgType)) {
GTEST_CHECK_(f != nullptr)
<< "NULL function pointer is passed into ResultOf().";
}
template <typename T>
- static ResType Invoke(ResType(*f)(ArgType), T arg) {
+ static ResType Invoke(ResType (*f)(ArgType), T arg) {
return (*f)(arg);
}
};
@@ -2162,13 +2207,21 @@ template <typename Callable, typename InnerMatcher>
class ResultOfMatcher {
public:
ResultOfMatcher(Callable callable, InnerMatcher matcher)
- : callable_(std::move(callable)), matcher_(std::move(matcher)) {
+ : ResultOfMatcher(/*result_description=*/"", std::move(callable),
+ std::move(matcher)) {}
+
+ ResultOfMatcher(const std::string& result_description, Callable callable,
+ InnerMatcher matcher)
+ : result_description_(result_description),
+ callable_(std::move(callable)),
+ matcher_(std::move(matcher)) {
CallableTraits<Callable>::CheckIsValid(callable_);
}
template <typename T>
operator Matcher<T>() const {
- return Matcher<T>(new Impl<const T&>(callable_, matcher_));
+ return Matcher<T>(
+ new Impl<const T&>(result_description_, callable_, matcher_));
}
private:
@@ -2181,21 +2234,36 @@ class ResultOfMatcher {
public:
template <typename M>
- Impl(const CallableStorageType& callable, const M& matcher)
- : callable_(callable), matcher_(MatcherCast<ResultType>(matcher)) {}
+ Impl(const std::string& result_description,
+ const CallableStorageType& callable, const M& matcher)
+ : result_description_(result_description),
+ callable_(callable),
+ matcher_(MatcherCast<ResultType>(matcher)) {}
void DescribeTo(::std::ostream* os) const override {
- *os << "is mapped by the given callable to a value that ";
+ if (result_description_.empty()) {
+ *os << "is mapped by the given callable to a value that ";
+ } else {
+ *os << "whose " << result_description_ << " ";
+ }
matcher_.DescribeTo(os);
}
void DescribeNegationTo(::std::ostream* os) const override {
- *os << "is mapped by the given callable to a value that ";
+ if (result_description_.empty()) {
+ *os << "is mapped by the given callable to a value that ";
+ } else {
+ *os << "whose " << result_description_ << " ";
+ }
matcher_.DescribeNegationTo(os);
}
bool MatchAndExplain(T obj, MatchResultListener* listener) const override {
- *listener << "which is mapped by the given callable to ";
+ if (result_description_.empty()) {
+ *listener << "which is mapped by the given callable to ";
+ } else {
+ *listener << "whose " << result_description_ << " is ";
+ }
// Cannot pass the return value directly to MatchPrintAndExplain, which
// takes a non-const reference as argument.
// Also, specifying template argument explicitly is needed because T could
@@ -2206,6 +2274,7 @@ class ResultOfMatcher {
}
private:
+ const std::string result_description_;
// Functors often define operator() as non-const method even though
// they are actually stateless. But we need to use them even when
// 'this' is a const pointer. It's the user's responsibility not to
@@ -2215,6 +2284,7 @@ class ResultOfMatcher {
const Matcher<ResultType> matcher_;
}; // class Impl
+ const std::string result_description_;
const CallableStorageType callable_;
const InnerMatcher matcher_;
};
@@ -2224,8 +2294,7 @@ template <typename SizeMatcher>
class SizeIsMatcher {
public:
explicit SizeIsMatcher(const SizeMatcher& size_matcher)
- : size_matcher_(size_matcher) {
- }
+ : size_matcher_(size_matcher) {}
template <typename Container>
operator Matcher<Container>() const {
@@ -2253,8 +2322,8 @@ class SizeIsMatcher {
SizeType size = container.size();
StringMatchResultListener size_listener;
const bool result = size_matcher_.MatchAndExplain(size, &size_listener);
- *listener
- << "whose size " << size << (result ? " matches" : " doesn't match");
+ *listener << "whose size " << size
+ << (result ? " matches" : " doesn't match");
PrintIfNotEmpty(size_listener.str(), listener->stream());
return result;
}
@@ -2283,8 +2352,9 @@ class BeginEndDistanceIsMatcher {
template <typename Container>
class Impl : public MatcherInterface<Container> {
public:
- typedef internal::StlContainerView<
- GTEST_REMOVE_REFERENCE_AND_CONST_(Container)> ContainerView;
+ typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_(
+ Container)>
+ ContainerView;
typedef typename std::iterator_traits<
typename ContainerView::type::const_iterator>::difference_type
DistanceType;
@@ -2364,18 +2434,15 @@ class ContainerEqMatcher {
typedef internal::StlContainerView<
typename std::remove_const<LhsContainer>::type>
LhsView;
- typedef typename LhsView::type LhsStlContainer;
StlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
- if (lhs_stl_container == expected_)
- return true;
+ if (lhs_stl_container == expected_) return true;
::std::ostream* const os = listener->stream();
if (os != nullptr) {
// Something is different. Check for extra values first.
bool printed_header = false;
- for (typename LhsStlContainer::const_iterator it =
- lhs_stl_container.begin();
- it != lhs_stl_container.end(); ++it) {
+ for (auto it = lhs_stl_container.begin(); it != lhs_stl_container.end();
+ ++it) {
if (internal::ArrayAwareFind(expected_.begin(), expected_.end(), *it) ==
expected_.end()) {
if (printed_header) {
@@ -2390,11 +2457,10 @@ class ContainerEqMatcher {
// Now check for missing values.
bool printed_header2 = false;
- for (typename StlContainer::const_iterator it = expected_.begin();
- it != expected_.end(); ++it) {
- if (internal::ArrayAwareFind(
- lhs_stl_container.begin(), lhs_stl_container.end(), *it) ==
- lhs_stl_container.end()) {
+ for (auto it = expected_.begin(); it != expected_.end(); ++it) {
+ if (internal::ArrayAwareFind(lhs_stl_container.begin(),
+ lhs_stl_container.end(),
+ *it) == lhs_stl_container.end()) {
if (printed_header2) {
*os << ", ";
} else {
@@ -2417,7 +2483,9 @@ class ContainerEqMatcher {
// A comparator functor that uses the < operator to compare two values.
struct LessComparator {
template <typename T, typename U>
- bool operator()(const T& lhs, const U& rhs) const { return lhs < rhs; }
+ bool operator()(const T& lhs, const U& rhs) const {
+ return lhs < rhs;
+ }
};
// Implements WhenSortedBy(comparator, container_matcher).
@@ -2436,14 +2504,16 @@ class WhenSortedByMatcher {
template <typename LhsContainer>
class Impl : public MatcherInterface<LhsContainer> {
public:
- typedef internal::StlContainerView<
- GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)> LhsView;
+ typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_(
+ LhsContainer)>
+ LhsView;
typedef typename LhsView::type LhsStlContainer;
typedef typename LhsView::const_reference LhsStlContainerReference;
// Transforms std::pair<const Key, Value> into std::pair<Key, Value>
// so that we can match associative containers.
- typedef typename RemoveConstFromKey<
- typename LhsStlContainer::value_type>::type LhsValue;
+ typedef
+ typename RemoveConstFromKey<typename LhsStlContainer::value_type>::type
+ LhsValue;
Impl(const Comparator& comparator, const ContainerMatcher& matcher)
: comparator_(comparator), matcher_(matcher) {}
@@ -2463,8 +2533,8 @@ class WhenSortedByMatcher {
LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
::std::vector<LhsValue> sorted_container(lhs_stl_container.begin(),
lhs_stl_container.end());
- ::std::sort(
- sorted_container.begin(), sorted_container.end(), comparator_);
+ ::std::sort(sorted_container.begin(), sorted_container.end(),
+ comparator_);
if (!listener->IsInterested()) {
// If the listener is not interested, we do not need to
@@ -2477,8 +2547,8 @@ class WhenSortedByMatcher {
*listener << " when sorted";
StringMatchResultListener inner_listener;
- const bool match = matcher_.MatchAndExplain(sorted_container,
- &inner_listener);
+ const bool match =
+ matcher_.MatchAndExplain(sorted_container, &inner_listener);
PrintIfNotEmpty(inner_listener.str(), listener->stream());
return match;
}
@@ -2487,7 +2557,8 @@ class WhenSortedByMatcher {
const Comparator comparator_;
const Matcher<const ::std::vector<LhsValue>&> matcher_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl);
+ Impl(const Impl&) = delete;
+ Impl& operator=(const Impl&) = delete;
};
private:
@@ -2501,9 +2572,9 @@ class WhenSortedByMatcher {
// container and the RHS container respectively.
template <typename TupleMatcher, typename RhsContainer>
class PointwiseMatcher {
- GTEST_COMPILE_ASSERT_(
+ static_assert(
!IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(RhsContainer)>::value,
- use_UnorderedPointwise_with_hash_tables);
+ "use UnorderedPointwise with hash tables");
public:
typedef internal::StlContainerView<RhsContainer> RhsView;
@@ -2522,9 +2593,9 @@ class PointwiseMatcher {
template <typename LhsContainer>
operator Matcher<LhsContainer>() const {
- GTEST_COMPILE_ASSERT_(
+ static_assert(
!IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)>::value,
- use_UnorderedPointwise_with_hash_tables);
+ "use UnorderedPointwise with hash tables");
return Matcher<LhsContainer>(
new Impl<const LhsContainer&>(tuple_matcher_, rhs_));
@@ -2533,8 +2604,9 @@ class PointwiseMatcher {
template <typename LhsContainer>
class Impl : public MatcherInterface<LhsContainer> {
public:
- typedef internal::StlContainerView<
- GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)> LhsView;
+ typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_(
+ LhsContainer)>
+ LhsView;
typedef typename LhsView::type LhsStlContainer;
typedef typename LhsView::const_reference LhsStlContainerReference;
typedef typename LhsStlContainer::value_type LhsValue;
@@ -2574,14 +2646,14 @@ class PointwiseMatcher {
return false;
}
- typename LhsStlContainer::const_iterator left = lhs_stl_container.begin();
- typename RhsStlContainer::const_iterator right = rhs_.begin();
+ auto left = lhs_stl_container.begin();
+ auto right = rhs_.begin();
for (size_t i = 0; i != actual_size; ++i, ++left, ++right) {
if (listener->IsInterested()) {
StringMatchResultListener inner_listener;
        // Create InnerMatcherArg as a temporary object to avoid it outliving
        // *left and *right. Dereference or the conversion to `const T&` may
- // return temp objects, e.g for vector<bool>.
+ // return temp objects, e.g. for vector<bool>.
if (!mono_tuple_matcher_.MatchAndExplain(
InnerMatcherArg(ImplicitCast_<const LhsValue&>(*left),
ImplicitCast_<const RhsValue&>(*right)),
@@ -2628,18 +2700,17 @@ class QuantifierMatcherImpl : public MatcherInterface<Container> {
template <typename InnerMatcher>
explicit QuantifierMatcherImpl(InnerMatcher inner_matcher)
: inner_matcher_(
- testing::SafeMatcherCast<const Element&>(inner_matcher)) {}
+ testing::SafeMatcherCast<const Element&>(inner_matcher)) {}
// Checks whether:
// * All elements in the container match, if all_elements_should_match.
// * Any element in the container matches, if !all_elements_should_match.
- bool MatchAndExplainImpl(bool all_elements_should_match,
- Container container,
+ bool MatchAndExplainImpl(bool all_elements_should_match, Container container,
MatchResultListener* listener) const {
StlContainerReference stl_container = View::ConstReference(container);
size_t i = 0;
- for (typename StlContainer::const_iterator it = stl_container.begin();
- it != stl_container.end(); ++it, ++i) {
+ for (auto it = stl_container.begin(); it != stl_container.end();
+ ++it, ++i) {
StringMatchResultListener inner_listener;
const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener);
@@ -2653,6 +2724,54 @@ class QuantifierMatcherImpl : public MatcherInterface<Container> {
return all_elements_should_match;
}
+ bool MatchAndExplainImpl(const Matcher<size_t>& count_matcher,
+ Container container,
+ MatchResultListener* listener) const {
+ StlContainerReference stl_container = View::ConstReference(container);
+ size_t i = 0;
+ std::vector<size_t> match_elements;
+ for (auto it = stl_container.begin(); it != stl_container.end();
+ ++it, ++i) {
+ StringMatchResultListener inner_listener;
+ const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener);
+ if (matches) {
+ match_elements.push_back(i);
+ }
+ }
+ if (listener->IsInterested()) {
+ if (match_elements.empty()) {
+ *listener << "has no element that matches";
+ } else if (match_elements.size() == 1) {
+ *listener << "whose element #" << match_elements[0] << " matches";
+ } else {
+ *listener << "whose elements (";
+ std::string sep = "";
+ for (size_t e : match_elements) {
+ *listener << sep << e;
+ sep = ", ";
+ }
+ *listener << ") match";
+ }
+ }
+ StringMatchResultListener count_listener;
+ if (count_matcher.MatchAndExplain(match_elements.size(), &count_listener)) {
+ *listener << " and whose match quantity of " << match_elements.size()
+ << " matches";
+ PrintIfNotEmpty(count_listener.str(), listener->stream());
+ return true;
+ } else {
+ if (match_elements.empty()) {
+ *listener << " and";
+ } else {
+ *listener << " but";
+ }
+ *listener << " whose match quantity of " << match_elements.size()
+ << " does not match";
+ PrintIfNotEmpty(count_listener.str(), listener->stream());
+ return false;
+ }
+ }
+
protected:
const Matcher<const Element&> inner_matcher_;
};
@@ -2709,6 +2828,58 @@ class EachMatcherImpl : public QuantifierMatcherImpl<Container> {
}
};
+// Implements Contains(element_matcher).Times(n) for the given argument type
+// Container.
+template <typename Container>
+class ContainsTimesMatcherImpl : public QuantifierMatcherImpl<Container> {
+ public:
+ template <typename InnerMatcher>
+ explicit ContainsTimesMatcherImpl(InnerMatcher inner_matcher,
+ Matcher<size_t> count_matcher)
+ : QuantifierMatcherImpl<Container>(inner_matcher),
+ count_matcher_(std::move(count_matcher)) {}
+
+ void DescribeTo(::std::ostream* os) const override {
+ *os << "quantity of elements that match ";
+ this->inner_matcher_.DescribeTo(os);
+ *os << " ";
+ count_matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const override {
+ *os << "quantity of elements that match ";
+ this->inner_matcher_.DescribeTo(os);
+ *os << " ";
+ count_matcher_.DescribeNegationTo(os);
+ }
+
+ bool MatchAndExplain(Container container,
+ MatchResultListener* listener) const override {
+ return this->MatchAndExplainImpl(count_matcher_, container, listener);
+ }
+
+ private:
+ const Matcher<size_t> count_matcher_;
+};
+
+// Implements polymorphic Contains(element_matcher).Times(n).
+template <typename M>
+class ContainsTimesMatcher {
+ public:
+ explicit ContainsTimesMatcher(M m, Matcher<size_t> count_matcher)
+ : inner_matcher_(m), count_matcher_(std::move(count_matcher)) {}
+
+ template <typename Container>
+ operator Matcher<Container>() const { // NOLINT
+ return Matcher<Container>(new ContainsTimesMatcherImpl<const Container&>(
+ inner_matcher_, count_matcher_));
+ }
+
+ private:
+ const M inner_matcher_;
+ const Matcher<size_t> count_matcher_;
+};
+
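+// Editorial note, not part of the upstream change: a minimal usage sketch of
+// Contains(...).Times(...), which the two classes above implement:
+//
+//   std::vector<int> v = {1, 2, 2, 3};
+//   EXPECT_THAT(v, Contains(2).Times(2));          // exactly two elements == 2
+//   EXPECT_THAT(v, Contains(Gt(1)).Times(Ge(3)));  // at least three elements > 1
+//
+// Times() takes any Matcher<size_t>, so exact counts and ranges both work;
+// Times(0) can be used to assert that no element matches.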
// Implements polymorphic Contains(element_matcher).
template <typename M>
class ContainsMatcher {
@@ -2716,11 +2887,15 @@ class ContainsMatcher {
explicit ContainsMatcher(M m) : inner_matcher_(m) {}
template <typename Container>
- operator Matcher<Container>() const {
+ operator Matcher<Container>() const { // NOLINT
return Matcher<Container>(
new ContainsMatcherImpl<const Container&>(inner_matcher_));
}
+ ContainsTimesMatcher<M> Times(Matcher<size_t> count_matcher) const {
+ return ContainsTimesMatcher<M>(inner_matcher_, std::move(count_matcher));
+ }
+
private:
const M inner_matcher_;
};
@@ -2732,7 +2907,7 @@ class EachMatcher {
explicit EachMatcher(M m) : inner_matcher_(m) {}
template <typename Container>
- operator Matcher<Container>() const {
+ operator Matcher<Container>() const { // NOLINT
return Matcher<Container>(
new EachMatcherImpl<const Container&>(inner_matcher_));
}
@@ -2778,8 +2953,7 @@ class KeyMatcherImpl : public MatcherInterface<PairType> {
template <typename InnerMatcher>
explicit KeyMatcherImpl(InnerMatcher inner_matcher)
: inner_matcher_(
- testing::SafeMatcherCast<const KeyType&>(inner_matcher)) {
- }
+ testing::SafeMatcherCast<const KeyType&>(inner_matcher)) {}
// Returns true if and only if 'key_value.first' (the key) matches the inner
// matcher.
@@ -2884,8 +3058,7 @@ class PairMatcherImpl : public MatcherInterface<PairType> {
: first_matcher_(
testing::SafeMatcherCast<const FirstType&>(first_matcher)),
second_matcher_(
- testing::SafeMatcherCast<const SecondType&>(second_matcher)) {
- }
+ testing::SafeMatcherCast<const SecondType&>(second_matcher)) {}
// Describes what this matcher does.
void DescribeTo(::std::ostream* os) const override {
@@ -2963,7 +3136,7 @@ class PairMatcher {
: first_matcher_(first_matcher), second_matcher_(second_matcher) {}
template <typename PairType>
- operator Matcher<PairType> () const {
+ operator Matcher<PairType>() const {
return Matcher<PairType>(
new PairMatcherImpl<const PairType&>(first_matcher_, second_matcher_));
}
@@ -3235,7 +3408,7 @@ class ElementsAreMatcherImpl : public MatcherInterface<Container> {
// explanations[i] is the explanation of the element at index i.
::std::vector<std::string> explanations(count());
StlContainerReference stl_container = View::ConstReference(container);
- typename StlContainer::const_iterator it = stl_container.begin();
+ auto it = stl_container.begin();
size_t exam_pos = 0;
bool mismatch_found = false; // Have we found a mismatched element yet?
@@ -3312,7 +3485,7 @@ class ElementsAreMatcherImpl : public MatcherInterface<Container> {
size_t count() const { return matchers_.size(); }
- ::std::vector<Matcher<const Element&> > matchers_;
+ ::std::vector<Matcher<const Element&>> matchers_;
};
// Connectivity matrix of (elements X matchers), in element-major order.
@@ -3324,8 +3497,7 @@ class GTEST_API_ MatchMatrix {
MatchMatrix(size_t num_elements, size_t num_matchers)
: num_elements_(num_elements),
num_matchers_(num_matchers),
- matched_(num_elements_* num_matchers_, 0) {
- }
+ matched_(num_elements_ * num_matchers_, 0) {}
size_t LhsSize() const { return num_elements_; }
size_t RhsSize() const { return num_matchers_; }
@@ -3364,8 +3536,7 @@ typedef ::std::vector<ElementMatcherPair> ElementMatcherPairs;
// Returns a maximum bipartite matching for the specified graph 'g'.
// The matching is represented as a vector of {element, matcher} pairs.
-GTEST_API_ ElementMatcherPairs
-FindMaxBipartiteMatching(const MatchMatrix& g);
+GTEST_API_ ElementMatcherPairs FindMaxBipartiteMatching(const MatchMatrix& g);
struct UnorderedMatcherRequire {
enum Flags {
@@ -3402,9 +3573,7 @@ class GTEST_API_ UnorderedElementsAreMatcherImplBase {
bool FindPairing(const MatchMatrix& matrix,
MatchResultListener* listener) const;
- MatcherDescriberVec& matcher_describers() {
- return matcher_describers_;
- }
+ MatcherDescriberVec& matcher_describers() { return matcher_describers_; }
static Message Elements(size_t n) {
return Message() << n << " element" << (n == 1 ? "" : "s");
@@ -3428,7 +3597,6 @@ class UnorderedElementsAreMatcherImpl
typedef internal::StlContainerView<RawContainer> View;
typedef typename View::type StlContainer;
typedef typename View::const_reference StlContainerReference;
- typedef typename StlContainer::const_iterator StlContainerConstIterator;
typedef typename StlContainer::value_type Element;
template <typename InputIter>
@@ -3511,7 +3679,7 @@ class UnorderedElementsAreMatcherImpl
return matrix;
}
- ::std::vector<Matcher<const Element&> > matchers_;
+ ::std::vector<Matcher<const Element&>> matchers_;
};
// Functor for use in TransformTuple.
@@ -3536,7 +3704,7 @@ class UnorderedElementsAreMatcher {
typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
typedef typename internal::StlContainerView<RawContainer>::type View;
typedef typename View::value_type Element;
- typedef ::std::vector<Matcher<const Element&> > MatcherVec;
+ typedef ::std::vector<Matcher<const Element&>> MatcherVec;
MatcherVec matchers;
matchers.reserve(::std::tuple_size<MatcherTuple>::value);
TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_,
@@ -3559,15 +3727,15 @@ class ElementsAreMatcher {
template <typename Container>
operator Matcher<Container>() const {
- GTEST_COMPILE_ASSERT_(
+ static_assert(
!IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value ||
::std::tuple_size<MatcherTuple>::value < 2,
- use_UnorderedElementsAre_with_hash_tables);
+ "use UnorderedElementsAre with hash tables");
typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
typedef typename internal::StlContainerView<RawContainer>::type View;
typedef typename View::value_type Element;
- typedef ::std::vector<Matcher<const Element&> > MatcherVec;
+ typedef ::std::vector<Matcher<const Element&>> MatcherVec;
MatcherVec matchers;
matchers.reserve(::std::tuple_size<MatcherTuple>::value);
TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_,
@@ -3610,9 +3778,9 @@ class ElementsAreArrayMatcher {
template <typename Container>
operator Matcher<Container>() const {
- GTEST_COMPILE_ASSERT_(
+ static_assert(
!IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value,
- use_UnorderedElementsAreArray_with_hash_tables);
+ "use UnorderedElementsAreArray with hash tables");
return Matcher<Container>(new ElementsAreMatcherImpl<const Container&>(
matchers_.begin(), matchers_.end()));
@@ -3702,9 +3870,9 @@ BoundSecondMatcher<Tuple2Matcher, Second> MatcherBindSecond(
// 'negation' is false; otherwise returns the description of the
// negation of the matcher. 'param_values' contains a list of strings
// that are the print-out of the matcher's parameters.
-GTEST_API_ std::string FormatMatcherDescription(bool negation,
- const char* matcher_name,
- const Strings& param_values);
+GTEST_API_ std::string FormatMatcherDescription(
+ bool negation, const char* matcher_name,
+ const std::vector<const char*>& param_names, const Strings& param_values);
// Implements a matcher that checks the value of an optional<> type variable.
template <typename ValueMatcher>
@@ -3981,26 +4149,26 @@ ElementsAreArray(Iter first, Iter last) {
}
template <typename T>
-inline internal::ElementsAreArrayMatcher<T> ElementsAreArray(
- const T* pointer, size_t count) {
+inline auto ElementsAreArray(const T* pointer, size_t count)
+ -> decltype(ElementsAreArray(pointer, pointer + count)) {
return ElementsAreArray(pointer, pointer + count);
}
template <typename T, size_t N>
-inline internal::ElementsAreArrayMatcher<T> ElementsAreArray(
- const T (&array)[N]) {
+inline auto ElementsAreArray(const T (&array)[N])
+ -> decltype(ElementsAreArray(array, N)) {
return ElementsAreArray(array, N);
}
template <typename Container>
-inline internal::ElementsAreArrayMatcher<typename Container::value_type>
-ElementsAreArray(const Container& container) {
+inline auto ElementsAreArray(const Container& container)
+ -> decltype(ElementsAreArray(container.begin(), container.end())) {
return ElementsAreArray(container.begin(), container.end());
}
template <typename T>
-inline internal::ElementsAreArrayMatcher<T>
-ElementsAreArray(::std::initializer_list<T> xs) {
+inline auto ElementsAreArray(::std::initializer_list<T> xs)
+ -> decltype(ElementsAreArray(xs.begin(), xs.end())) {
return ElementsAreArray(xs.begin(), xs.end());
}
@@ -4027,14 +4195,14 @@ UnorderedElementsAreArray(Iter first, Iter last) {
}
template <typename T>
-inline internal::UnorderedElementsAreArrayMatcher<T>
-UnorderedElementsAreArray(const T* pointer, size_t count) {
+inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray(
+ const T* pointer, size_t count) {
return UnorderedElementsAreArray(pointer, pointer + count);
}
template <typename T, size_t N>
-inline internal::UnorderedElementsAreArrayMatcher<T>
-UnorderedElementsAreArray(const T (&array)[N]) {
+inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray(
+ const T (&array)[N]) {
return UnorderedElementsAreArray(array, N);
}
@@ -4046,8 +4214,8 @@ UnorderedElementsAreArray(const Container& container) {
}
template <typename T>
-inline internal::UnorderedElementsAreArrayMatcher<T>
-UnorderedElementsAreArray(::std::initializer_list<T> xs) {
+inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray(
+ ::std::initializer_list<T> xs) {
return UnorderedElementsAreArray(xs.begin(), xs.end());
}
@@ -4081,14 +4249,14 @@ Matcher<T> internal::MatcherCastImpl<T, M>::CastImpl(
}
// Creates a polymorphic matcher that matches any NULL pointer.
-inline PolymorphicMatcher<internal::IsNullMatcher > IsNull() {
+inline PolymorphicMatcher<internal::IsNullMatcher> IsNull() {
return MakePolymorphicMatcher(internal::IsNullMatcher());
}
// Creates a polymorphic matcher that matches any non-NULL pointer.
// This is convenient as Not(NULL) doesn't compile (the compiler
// thinks that that expression is comparing a pointer with an integer).
-inline PolymorphicMatcher<internal::NotNullMatcher > NotNull() {
+inline PolymorphicMatcher<internal::NotNullMatcher> NotNull() {
return MakePolymorphicMatcher(internal::NotNullMatcher());
}
@@ -4119,8 +4287,8 @@ inline internal::FloatingEqMatcher<double> NanSensitiveDoubleEq(double rhs) {
// Creates a matcher that matches any double argument approximately equal to
// rhs, up to the specified max absolute error bound, where two NANs are
// considered unequal. The max absolute error bound must be non-negative.
-inline internal::FloatingEqMatcher<double> DoubleNear(
- double rhs, double max_abs_error) {
+inline internal::FloatingEqMatcher<double> DoubleNear(double rhs,
+ double max_abs_error) {
return internal::FloatingEqMatcher<double>(rhs, false, max_abs_error);
}
@@ -4147,8 +4315,8 @@ inline internal::FloatingEqMatcher<float> NanSensitiveFloatEq(float rhs) {
// Creates a matcher that matches any float argument approximately equal to
// rhs, up to the specified max absolute error bound, where two NANs are
// considered unequal. The max absolute error bound must be non-negative.
-inline internal::FloatingEqMatcher<float> FloatNear(
- float rhs, float max_abs_error) {
+inline internal::FloatingEqMatcher<float> FloatNear(float rhs,
+ float max_abs_error) {
return internal::FloatingEqMatcher<float>(rhs, false, max_abs_error);
}
@@ -4176,7 +4344,7 @@ inline internal::PointeeMatcher<InnerMatcher> Pointee(
// If To is a reference and the cast fails, this matcher returns false
// immediately.
template <typename To>
-inline PolymorphicMatcher<internal::WhenDynamicCastToMatcher<To> >
+inline PolymorphicMatcher<internal::WhenDynamicCastToMatcher<To>>
WhenDynamicCastTo(const Matcher<To>& inner_matcher) {
return MakePolymorphicMatcher(
internal::WhenDynamicCastToMatcher<To>(inner_matcher));
@@ -4188,12 +4356,10 @@ WhenDynamicCastTo(const Matcher<To>& inner_matcher) {
// Field(&Foo::number, Ge(5))
// matches a Foo object x if and only if x.number >= 5.
template <typename Class, typename FieldType, typename FieldMatcher>
-inline PolymorphicMatcher<
- internal::FieldMatcher<Class, FieldType> > Field(
+inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType>> Field(
FieldType Class::*field, const FieldMatcher& matcher) {
- return MakePolymorphicMatcher(
- internal::FieldMatcher<Class, FieldType>(
- field, MatcherCast<const FieldType&>(matcher)));
+ return MakePolymorphicMatcher(internal::FieldMatcher<Class, FieldType>(
+ field, MatcherCast<const FieldType&>(matcher)));
// The call to MatcherCast() is required for supporting inner
// matchers of compatible types. For example, it allows
// Field(&Foo::bar, m)
@@ -4203,7 +4369,7 @@ inline PolymorphicMatcher<
// Same as Field() but also takes the name of the field to provide better error
// messages.
template <typename Class, typename FieldType, typename FieldMatcher>
-inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType> > Field(
+inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType>> Field(
const std::string& field_name, FieldType Class::*field,
const FieldMatcher& matcher) {
return MakePolymorphicMatcher(internal::FieldMatcher<Class, FieldType>(
@@ -4216,7 +4382,7 @@ inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType> > Field(
// matches a Foo object x if and only if x.str() starts with "hi".
template <typename Class, typename PropertyType, typename PropertyMatcher>
inline PolymorphicMatcher<internal::PropertyMatcher<
- Class, PropertyType, PropertyType (Class::*)() const> >
+ Class, PropertyType, PropertyType (Class::*)() const>>
Property(PropertyType (Class::*property)() const,
const PropertyMatcher& matcher) {
return MakePolymorphicMatcher(
@@ -4233,7 +4399,7 @@ Property(PropertyType (Class::*property)() const,
// better error messages.
template <typename Class, typename PropertyType, typename PropertyMatcher>
inline PolymorphicMatcher<internal::PropertyMatcher<
- Class, PropertyType, PropertyType (Class::*)() const> >
+ Class, PropertyType, PropertyType (Class::*)() const>>
Property(const std::string& property_name,
PropertyType (Class::*property)() const,
const PropertyMatcher& matcher) {
@@ -4246,8 +4412,8 @@ Property(const std::string& property_name,
// The same as above but for reference-qualified member functions.
template <typename Class, typename PropertyType, typename PropertyMatcher>
inline PolymorphicMatcher<internal::PropertyMatcher<
- Class, PropertyType, PropertyType (Class::*)() const &> >
-Property(PropertyType (Class::*property)() const &,
+ Class, PropertyType, PropertyType (Class::*)() const&>>
+Property(PropertyType (Class::*property)() const&,
const PropertyMatcher& matcher) {
return MakePolymorphicMatcher(
internal::PropertyMatcher<Class, PropertyType,
@@ -4258,9 +4424,9 @@ Property(PropertyType (Class::*property)() const &,
// Three-argument form for reference-qualified member functions.
template <typename Class, typename PropertyType, typename PropertyMatcher>
inline PolymorphicMatcher<internal::PropertyMatcher<
- Class, PropertyType, PropertyType (Class::*)() const &> >
+ Class, PropertyType, PropertyType (Class::*)() const&>>
Property(const std::string& property_name,
- PropertyType (Class::*property)() const &,
+ PropertyType (Class::*property)() const&,
const PropertyMatcher& matcher) {
return MakePolymorphicMatcher(
internal::PropertyMatcher<Class, PropertyType,
@@ -4279,15 +4445,25 @@ Property(const std::string& property_name,
template <typename Callable, typename InnerMatcher>
internal::ResultOfMatcher<Callable, InnerMatcher> ResultOf(
Callable callable, InnerMatcher matcher) {
+ return internal::ResultOfMatcher<Callable, InnerMatcher>(std::move(callable),
+ std::move(matcher));
+}
+
+// Same as ResultOf() above, but also takes a description of the `callable`
+// result to provide better error messages.
+template <typename Callable, typename InnerMatcher>
+internal::ResultOfMatcher<Callable, InnerMatcher> ResultOf(
+ const std::string& result_description, Callable callable,
+ InnerMatcher matcher) {
return internal::ResultOfMatcher<Callable, InnerMatcher>(
- std::move(callable), std::move(matcher));
+ result_description, std::move(callable), std::move(matcher));
}
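+// Editorial note, not part of the upstream change: a minimal usage sketch of
+// the new three-argument ResultOf() overload above; `Response` and its `code`
+// field are hypothetical:
+//
+//   EXPECT_THAT(response, ResultOf("status code",
+//                                  [](const Response& r) { return r.code; },
+//                                  Eq(200)));
+//
+// On failure the explanation reads "whose status code is ..." instead of the
+// generic "is mapped by the given callable to a value that ...".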
// String matchers.
// Matches a string equal to str.
template <typename T = std::string>
-PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrEq(
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrEq(
const internal::StringLike<T>& str) {
return MakePolymorphicMatcher(
internal::StrEqualityMatcher<std::string>(std::string(str), true, true));
@@ -4295,7 +4471,7 @@ PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrEq(
// Matches a string not equal to str.
template <typename T = std::string>
-PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrNe(
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrNe(
const internal::StringLike<T>& str) {
return MakePolymorphicMatcher(
internal::StrEqualityMatcher<std::string>(std::string(str), false, true));
@@ -4303,7 +4479,7 @@ PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrNe(
// Matches a string equal to str, ignoring case.
template <typename T = std::string>
-PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrCaseEq(
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrCaseEq(
const internal::StringLike<T>& str) {
return MakePolymorphicMatcher(
internal::StrEqualityMatcher<std::string>(std::string(str), true, false));
@@ -4311,7 +4487,7 @@ PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrCaseEq(
// Matches a string not equal to str, ignoring case.
template <typename T = std::string>
-PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrCaseNe(
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrCaseNe(
const internal::StringLike<T>& str) {
return MakePolymorphicMatcher(internal::StrEqualityMatcher<std::string>(
std::string(str), false, false));
@@ -4320,7 +4496,7 @@ PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrCaseNe(
// Creates a matcher that matches any string, std::string, or C string
// that contains the given substring.
template <typename T = std::string>
-PolymorphicMatcher<internal::HasSubstrMatcher<std::string> > HasSubstr(
+PolymorphicMatcher<internal::HasSubstrMatcher<std::string>> HasSubstr(
const internal::StringLike<T>& substring) {
return MakePolymorphicMatcher(
internal::HasSubstrMatcher<std::string>(std::string(substring)));
@@ -4328,7 +4504,7 @@ PolymorphicMatcher<internal::HasSubstrMatcher<std::string> > HasSubstr(
// Matches a string that starts with 'prefix' (case-sensitive).
template <typename T = std::string>
-PolymorphicMatcher<internal::StartsWithMatcher<std::string> > StartsWith(
+PolymorphicMatcher<internal::StartsWithMatcher<std::string>> StartsWith(
const internal::StringLike<T>& prefix) {
return MakePolymorphicMatcher(
internal::StartsWithMatcher<std::string>(std::string(prefix)));
@@ -4336,7 +4512,7 @@ PolymorphicMatcher<internal::StartsWithMatcher<std::string> > StartsWith(
// Matches a string that ends with 'suffix' (case-sensitive).
template <typename T = std::string>
-PolymorphicMatcher<internal::EndsWithMatcher<std::string> > EndsWith(
+PolymorphicMatcher<internal::EndsWithMatcher<std::string>> EndsWith(
const internal::StringLike<T>& suffix) {
return MakePolymorphicMatcher(
internal::EndsWithMatcher<std::string>(std::string(suffix)));
@@ -4346,50 +4522,50 @@ PolymorphicMatcher<internal::EndsWithMatcher<std::string> > EndsWith(
// Wide string matchers.
// Matches a string equal to str.
-inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> > StrEq(
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrEq(
const std::wstring& str) {
return MakePolymorphicMatcher(
internal::StrEqualityMatcher<std::wstring>(str, true, true));
}
// Matches a string not equal to str.
-inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> > StrNe(
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrNe(
const std::wstring& str) {
return MakePolymorphicMatcher(
internal::StrEqualityMatcher<std::wstring>(str, false, true));
}
// Matches a string equal to str, ignoring case.
-inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> >
-StrCaseEq(const std::wstring& str) {
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrCaseEq(
+ const std::wstring& str) {
return MakePolymorphicMatcher(
internal::StrEqualityMatcher<std::wstring>(str, true, false));
}
// Matches a string not equal to str, ignoring case.
-inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> >
-StrCaseNe(const std::wstring& str) {
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrCaseNe(
+ const std::wstring& str) {
return MakePolymorphicMatcher(
internal::StrEqualityMatcher<std::wstring>(str, false, false));
}
// Creates a matcher that matches any ::wstring, std::wstring, or C wide string
// that contains the given substring.
-inline PolymorphicMatcher<internal::HasSubstrMatcher<std::wstring> > HasSubstr(
+inline PolymorphicMatcher<internal::HasSubstrMatcher<std::wstring>> HasSubstr(
const std::wstring& substring) {
return MakePolymorphicMatcher(
internal::HasSubstrMatcher<std::wstring>(substring));
}
// Matches a string that starts with 'prefix' (case-sensitive).
-inline PolymorphicMatcher<internal::StartsWithMatcher<std::wstring> >
-StartsWith(const std::wstring& prefix) {
+inline PolymorphicMatcher<internal::StartsWithMatcher<std::wstring>> StartsWith(
+ const std::wstring& prefix) {
return MakePolymorphicMatcher(
internal::StartsWithMatcher<std::wstring>(prefix));
}
// Matches a string that ends with 'suffix' (case-sensitive).
-inline PolymorphicMatcher<internal::EndsWithMatcher<std::wstring> > EndsWith(
+inline PolymorphicMatcher<internal::EndsWithMatcher<std::wstring>> EndsWith(
const std::wstring& suffix) {
return MakePolymorphicMatcher(
internal::EndsWithMatcher<std::wstring>(suffix));
@@ -4484,8 +4660,8 @@ inline internal::NotMatcher<InnerMatcher> Not(InnerMatcher m) {
// predicate. The predicate can be any unary function or functor
// whose return type can be implicitly converted to bool.
template <typename Predicate>
-inline PolymorphicMatcher<internal::TrulyMatcher<Predicate> >
-Truly(Predicate pred) {
+inline PolymorphicMatcher<internal::TrulyMatcher<Predicate>> Truly(
+ Predicate pred) {
return MakePolymorphicMatcher(internal::TrulyMatcher<Predicate>(pred));
}
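A short usage sketch for Truly(), assuming a test body with the usual gmock headers; the predicates are illustrative:

  EXPECT_THAT(6, Truly([](int n) { return n % 2 == 0; }));    // predicate returns true
  EXPECT_THAT(-3, Not(Truly([](int n) { return n > 0; })));   // any unary functor works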
@@ -4496,8 +4672,8 @@ Truly(Predicate pred) {
// EXPECT_THAT(container, SizeIs(2)); // Checks container has 2 elements.
// EXPECT_THAT(container, SizeIs(Le(2))); // Checks container has at most 2.
template <typename SizeMatcher>
-inline internal::SizeIsMatcher<SizeMatcher>
-SizeIs(const SizeMatcher& size_matcher) {
+inline internal::SizeIsMatcher<SizeMatcher> SizeIs(
+ const SizeMatcher& size_matcher) {
return internal::SizeIsMatcher<SizeMatcher>(size_matcher);
}
@@ -4507,8 +4683,8 @@ SizeIs(const SizeMatcher& size_matcher) {
// do not implement size(). The container must provide const_iterator (with
// valid iterator_traits), begin() and end().
template <typename DistanceMatcher>
-inline internal::BeginEndDistanceIsMatcher<DistanceMatcher>
-BeginEndDistanceIs(const DistanceMatcher& distance_matcher) {
+inline internal::BeginEndDistanceIsMatcher<DistanceMatcher> BeginEndDistanceIs(
+ const DistanceMatcher& distance_matcher) {
return internal::BeginEndDistanceIsMatcher<DistanceMatcher>(distance_matcher);
}
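A brief sketch contrasting SizeIs() and BeginEndDistanceIs(), assuming a test body; the containers are illustrative:

  std::vector<int> v = {1, 2, 3};
  EXPECT_THAT(v, SizeIs(3));                  // uses v.size()
  std::forward_list<int> fl = {1, 2, 3};      // forward_list has no size()
  EXPECT_THAT(fl, BeginEndDistanceIs(3));     // uses the distance from begin() to end()
  EXPECT_THAT(fl, BeginEndDistanceIs(Le(5)));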
@@ -4517,8 +4693,8 @@ BeginEndDistanceIs(const DistanceMatcher& distance_matcher) {
// values that are included in one container but not the other. (Duplicate
// values and order differences are not explained.)
template <typename Container>
-inline PolymorphicMatcher<internal::ContainerEqMatcher<
- typename std::remove_const<Container>::type>>
+inline PolymorphicMatcher<
+ internal::ContainerEqMatcher<typename std::remove_const<Container>::type>>
ContainerEq(const Container& rhs) {
return MakePolymorphicMatcher(internal::ContainerEqMatcher<Container>(rhs));
}
@@ -4526,9 +4702,8 @@ ContainerEq(const Container& rhs) {
// Returns a matcher that matches a container that, when sorted using
// the given comparator, matches container_matcher.
template <typename Comparator, typename ContainerMatcher>
-inline internal::WhenSortedByMatcher<Comparator, ContainerMatcher>
-WhenSortedBy(const Comparator& comparator,
- const ContainerMatcher& container_matcher) {
+inline internal::WhenSortedByMatcher<Comparator, ContainerMatcher> WhenSortedBy(
+ const Comparator& comparator, const ContainerMatcher& container_matcher) {
return internal::WhenSortedByMatcher<Comparator, ContainerMatcher>(
comparator, container_matcher);
}
@@ -4538,9 +4713,9 @@ WhenSortedBy(const Comparator& comparator,
template <typename ContainerMatcher>
inline internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>
WhenSorted(const ContainerMatcher& container_matcher) {
- return
- internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>(
- internal::LessComparator(), container_matcher);
+ return internal::WhenSortedByMatcher<internal::LessComparator,
+ ContainerMatcher>(
+ internal::LessComparator(), container_matcher);
}
// Matches an STL-style container or a native array that contains the
@@ -4557,15 +4732,13 @@ Pointwise(const TupleMatcher& tuple_matcher, const Container& rhs) {
rhs);
}
-
// Supports the Pointwise(m, {a, b, c}) syntax.
template <typename TupleMatcher, typename T>
-inline internal::PointwiseMatcher<TupleMatcher, std::vector<T> > Pointwise(
+inline internal::PointwiseMatcher<TupleMatcher, std::vector<T>> Pointwise(
const TupleMatcher& tuple_matcher, std::initializer_list<T> rhs) {
return Pointwise(tuple_matcher, std::vector<T>(rhs));
}
-
// UnorderedPointwise(pair_matcher, rhs) matches an STL-style
// container or a native array that contains the same number of
// elements as in rhs, where in some permutation of the container, its
@@ -4594,28 +4767,25 @@ UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
RhsView::ConstReference(rhs_container);
// Create a matcher for each element in rhs_container.
- ::std::vector<internal::BoundSecondMatcher<Tuple2Matcher, Second> > matchers;
- for (typename RhsStlContainer::const_iterator it = rhs_stl_container.begin();
- it != rhs_stl_container.end(); ++it) {
- matchers.push_back(
- internal::MatcherBindSecond(tuple2_matcher, *it));
+ ::std::vector<internal::BoundSecondMatcher<Tuple2Matcher, Second>> matchers;
+ for (auto it = rhs_stl_container.begin(); it != rhs_stl_container.end();
+ ++it) {
+ matchers.push_back(internal::MatcherBindSecond(tuple2_matcher, *it));
}
// Delegate the work to UnorderedElementsAreArray().
return UnorderedElementsAreArray(matchers);
}
-
// Supports the UnorderedPointwise(m, {a, b, c}) syntax.
template <typename Tuple2Matcher, typename T>
inline internal::UnorderedElementsAreArrayMatcher<
- typename internal::BoundSecondMatcher<Tuple2Matcher, T> >
+ typename internal::BoundSecondMatcher<Tuple2Matcher, T>>
UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
std::initializer_list<T> rhs) {
return UnorderedPointwise(tuple2_matcher, std::vector<T>(rhs));
}
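A minimal sketch of the initializer-list forms defined above, assuming a test body; the values are illustrative:

  std::vector<int> actual = {1, 2, 3};
  // Each element must be less than the corresponding right-hand value, in order.
  EXPECT_THAT(actual, Pointwise(Lt(), {2, 3, 4}));
  // The same pairwise check, but matched against some permutation of the right-hand side.
  EXPECT_THAT(actual, UnorderedPointwise(Lt(), {4, 2, 3}));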
-
// Matches an STL-style container or a native array that contains at
// least one element matching the given value or matcher.
//
@@ -4625,7 +4795,7 @@ UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
// page_ids.insert(1);
// EXPECT_THAT(page_ids, Contains(1));
// EXPECT_THAT(page_ids, Contains(Gt(2)));
-// EXPECT_THAT(page_ids, Not(Contains(4)));
+// EXPECT_THAT(page_ids, Not(Contains(4))); // See below for Times(0)
//
// ::std::map<int, size_t> page_lengths;
// page_lengths[1] = 100;
@@ -4634,6 +4804,19 @@ UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
//
// const char* user_ids[] = { "joe", "mike", "tom" };
// EXPECT_THAT(user_ids, Contains(Eq(::std::string("tom"))));
+//
+// The matcher supports a modifier `Times` that allows checking for arbitrary
+// occurrences, including testing for absence with Times(0).
+//
+// Examples:
+// ::std::vector<int> ids;
+// ids.push_back(1);
+// ids.push_back(1);
+// ids.push_back(3);
+// EXPECT_THAT(ids, Contains(1).Times(2)); // 1 occurs 2 times
+// EXPECT_THAT(ids, Contains(2).Times(0)); // 2 is not present
+// EXPECT_THAT(ids, Contains(3).Times(Ge(1))); // 3 occurs at least once
+
template <typename M>
inline internal::ContainsMatcher<M> Contains(M matcher) {
return internal::ContainsMatcher<M>(matcher);
@@ -4760,7 +4943,7 @@ inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf(
// Matches an STL-style container or a native array that contains only
// elements matching the given value or matcher.
//
-// Each(m) is semantically equivalent to Not(Contains(Not(m))). Only
+// Each(m) is semantically equivalent to `Not(Contains(Not(m)))`. Only
// the messages are different.
//
// Examples:
@@ -4803,13 +4986,25 @@ inline internal::KeyMatcher<M> Key(M inner_matcher) {
// to match a std::map<int, string> that contains exactly one element whose key
// is >= 5 and whose value equals "foo".
template <typename FirstMatcher, typename SecondMatcher>
-inline internal::PairMatcher<FirstMatcher, SecondMatcher>
-Pair(FirstMatcher first_matcher, SecondMatcher second_matcher) {
- return internal::PairMatcher<FirstMatcher, SecondMatcher>(
- first_matcher, second_matcher);
+inline internal::PairMatcher<FirstMatcher, SecondMatcher> Pair(
+ FirstMatcher first_matcher, SecondMatcher second_matcher) {
+ return internal::PairMatcher<FirstMatcher, SecondMatcher>(first_matcher,
+ second_matcher);
}
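A short sketch of Key() and Pair() over an associative container, assuming a test body; the map contents are illustrative:

  std::map<int, std::string> page_names = {{1, "home"}, {7, "about"}};
  EXPECT_THAT(page_names, Contains(Key(7)));
  EXPECT_THAT(page_names, Contains(Pair(Ge(5), "about")));
  EXPECT_THAT(page_names, Not(Contains(Key(42))));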
namespace no_adl {
+// Conditional() creates a matcher that conditionally uses either the first or
+// second matcher provided. For example, we could create an `equal if, and only
+// if' matcher using the Conditional wrapper as follows:
+//
+// EXPECT_THAT(result, Conditional(condition, Eq(expected), Ne(expected)));
+template <typename MatcherTrue, typename MatcherFalse>
+internal::ConditionalMatcher<MatcherTrue, MatcherFalse> Conditional(
+ bool condition, MatcherTrue matcher_true, MatcherFalse matcher_false) {
+ return internal::ConditionalMatcher<MatcherTrue, MatcherFalse>(
+ condition, std::move(matcher_true), std::move(matcher_false));
+}
+
// FieldsAre(matchers...) matches piecewise the fields of compatible structs.
// These include those that support `get<I>(obj)`, and when structured bindings
// are enabled any class that supports them.
@@ -4836,6 +5031,14 @@ inline internal::AddressMatcher<InnerMatcher> Address(
const InnerMatcher& inner_matcher) {
return internal::AddressMatcher<InnerMatcher>(inner_matcher);
}
+
+// Matches a base64 escaped string, when the unescaped string matches the
+// internal matcher.
+template <typename MatcherType>
+internal::WhenBase64UnescapedMatcher WhenBase64Unescaped(
+ const MatcherType& internal_matcher) {
+ return internal::WhenBase64UnescapedMatcher(internal_matcher);
+}
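A hedged sketch of the new WhenBase64Unescaped() matcher, assuming "aGVsbG8gd29ybGQh" is the standard base64 encoding of "hello world!" (illustrative, not from this diff):

  EXPECT_THAT("aGVsbG8gd29ybGQh", WhenBase64Unescaped(HasSubstr("world")));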
} // namespace no_adl
// Returns a predicate that is satisfied by anything that matches the
@@ -4854,8 +5057,8 @@ inline bool Value(const T& value, M matcher) {
// Matches the value against the given matcher and explains the match
// result to listener.
template <typename T, typename M>
-inline bool ExplainMatchResult(
- M matcher, const T& value, MatchResultListener* listener) {
+inline bool ExplainMatchResult(M matcher, const T& value,
+ MatchResultListener* listener) {
return SafeMatcherCast<const T&>(matcher).MatchAndExplain(value, listener);
}
@@ -4865,7 +5068,8 @@ inline bool ExplainMatchResult(
//
// MATCHER_P(XAndYThat, matcher,
// "X that " + DescribeMatcher<int>(matcher, negation) +
-// " and Y that " + DescribeMatcher<double>(matcher, negation)) {
+// (negation ? " or" : " and") + " Y that " +
+// DescribeMatcher<double>(matcher, negation)) {
// return ExplainMatchResult(matcher, arg.x(), result_listener) &&
// ExplainMatchResult(matcher, arg.y(), result_listener);
// }
@@ -5014,7 +5218,9 @@ internal::ArgsMatcher<typename std::decay<InnerMatcher>::type, k...> Args(
//
// EXPECT_CALL(foo, Bar(_, _)).With(Eq());
template <typename InnerMatcher>
-inline InnerMatcher AllArgs(const InnerMatcher& matcher) { return matcher; }
+inline InnerMatcher AllArgs(const InnerMatcher& matcher) {
+ return matcher;
+}
// Returns a matcher that matches the value of an optional<> type variable.
// The matcher implementation only uses '!arg' and requires that the optional<>
@@ -5032,7 +5238,7 @@ inline internal::OptionalMatcher<ValueMatcher> Optional(
// Returns a matcher that matches the value of a absl::any type variable.
template <typename T>
-PolymorphicMatcher<internal::any_cast_matcher::AnyCastMatcher<T> > AnyWith(
+PolymorphicMatcher<internal::any_cast_matcher::AnyCastMatcher<T>> AnyWith(
const Matcher<const T&>& matcher) {
return MakePolymorphicMatcher(
internal::any_cast_matcher::AnyCastMatcher<T>(matcher));
@@ -5043,7 +5249,7 @@ PolymorphicMatcher<internal::any_cast_matcher::AnyCastMatcher<T> > AnyWith(
// functions.
// It is compatible with std::variant.
template <typename T>
-PolymorphicMatcher<internal::variant_matcher::VariantMatcher<T> > VariantWith(
+PolymorphicMatcher<internal::variant_matcher::VariantMatcher<T>> VariantWith(
const Matcher<const T&>& matcher) {
return MakePolymorphicMatcher(
internal::variant_matcher::VariantMatcher<T>(matcher));
@@ -5072,7 +5278,8 @@ class WithWhatMatcherImpl {
template <typename Err>
bool MatchAndExplain(const Err& err, MatchResultListener* listener) const {
- *listener << "which contains .what() that ";
+ *listener << "which contains .what() (of value = " << err.what()
+ << ") that ";
return matcher_.MatchAndExplain(err.what(), listener);
}
@@ -5222,12 +5429,14 @@ PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> ThrowsMessage(
// tests. ASSERT_THAT(value, matcher) and EXPECT_THAT(value, matcher)
// succeed if and only if the value matches the matcher. If the assertion
// fails, the value and the description of the matcher will be printed.
-#define ASSERT_THAT(value, matcher) ASSERT_PRED_FORMAT1(\
- ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
-#define EXPECT_THAT(value, matcher) EXPECT_PRED_FORMAT1(\
- ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
-
-// MATCHER* macroses itself are listed below.
+#define ASSERT_THAT(value, matcher) \
+ ASSERT_PRED_FORMAT1( \
+ ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
+#define EXPECT_THAT(value, matcher) \
+ EXPECT_PRED_FORMAT1( \
+ ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
+
+// MATCHER* macros themselves are listed below.
#define MATCHER(name, description) \
class name##Matcher \
: public ::testing::internal::MatcherBaseImpl<name##Matcher> { \
@@ -5248,12 +5457,13 @@ PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> ThrowsMessage(
\
private: \
::std::string FormatDescription(bool negation) const { \
+ /* NOLINTNEXTLINE readability-redundant-string-init */ \
::std::string gmock_description = (description); \
if (!gmock_description.empty()) { \
return gmock_description; \
} \
return ::testing::internal::FormatMatcherDescription(negation, #name, \
- {}); \
+ {}, {}); \
} \
}; \
}; \
@@ -5265,33 +5475,41 @@ PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> ThrowsMessage(
const
#define MATCHER_P(name, p0, description) \
- GMOCK_INTERNAL_MATCHER(name, name##MatcherP, description, (p0))
-#define MATCHER_P2(name, p0, p1, description) \
- GMOCK_INTERNAL_MATCHER(name, name##MatcherP2, description, (p0, p1))
-#define MATCHER_P3(name, p0, p1, p2, description) \
- GMOCK_INTERNAL_MATCHER(name, name##MatcherP3, description, (p0, p1, p2))
-#define MATCHER_P4(name, p0, p1, p2, p3, description) \
- GMOCK_INTERNAL_MATCHER(name, name##MatcherP4, description, (p0, p1, p2, p3))
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP, description, (#p0), (p0))
+#define MATCHER_P2(name, p0, p1, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP2, description, (#p0, #p1), \
+ (p0, p1))
+#define MATCHER_P3(name, p0, p1, p2, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP3, description, (#p0, #p1, #p2), \
+ (p0, p1, p2))
+#define MATCHER_P4(name, p0, p1, p2, p3, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP4, description, \
+ (#p0, #p1, #p2, #p3), (p0, p1, p2, p3))
#define MATCHER_P5(name, p0, p1, p2, p3, p4, description) \
GMOCK_INTERNAL_MATCHER(name, name##MatcherP5, description, \
- (p0, p1, p2, p3, p4))
+ (#p0, #p1, #p2, #p3, #p4), (p0, p1, p2, p3, p4))
#define MATCHER_P6(name, p0, p1, p2, p3, p4, p5, description) \
GMOCK_INTERNAL_MATCHER(name, name##MatcherP6, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5), \
(p0, p1, p2, p3, p4, p5))
#define MATCHER_P7(name, p0, p1, p2, p3, p4, p5, p6, description) \
GMOCK_INTERNAL_MATCHER(name, name##MatcherP7, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6), \
(p0, p1, p2, p3, p4, p5, p6))
#define MATCHER_P8(name, p0, p1, p2, p3, p4, p5, p6, p7, description) \
GMOCK_INTERNAL_MATCHER(name, name##MatcherP8, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7), \
(p0, p1, p2, p3, p4, p5, p6, p7))
#define MATCHER_P9(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, description) \
GMOCK_INTERNAL_MATCHER(name, name##MatcherP9, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7, #p8), \
(p0, p1, p2, p3, p4, p5, p6, p7, p8))
#define MATCHER_P10(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, description) \
GMOCK_INTERNAL_MATCHER(name, name##MatcherP10, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7, #p8, #p9), \
(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9))
-#define GMOCK_INTERNAL_MATCHER(name, full_name, description, args) \
+#define GMOCK_INTERNAL_MATCHER(name, full_name, description, arg_names, args) \
template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)> \
class full_name : public ::testing::internal::MatcherBaseImpl< \
full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>> { \
@@ -5320,7 +5538,7 @@ PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> ThrowsMessage(
return gmock_description; \
} \
return ::testing::internal::FormatMatcherDescription( \
- negation, #name, \
+ negation, #name, {GMOCK_PP_REMOVE_PARENS(arg_names)}, \
::testing::internal::UniversalTersePrintTupleFieldsToStrings( \
::std::tuple<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>( \
GMOCK_INTERNAL_MATCHER_MEMBERS_USAGE(args)))); \
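To illustrate the arg_names parameter now threaded through FormatMatcherDescription above, a hypothetical parameterized matcher (not part of this diff):

  MATCHER_P(IsDivisibleBy, n, "") { return (arg % n) == 0; }
  // A failing EXPECT_THAT(27, IsDivisibleBy(4)) can now describe the matcher with
  // the parameter name attached to its value, roughly "is divisible by (n: 4)",
  // instead of printing the value alone.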
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h b/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h
index fd293358a..148ac0172 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h
@@ -27,12 +27,12 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file implements some commonly used variadic actions.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
@@ -129,170 +129,207 @@
// Declares the template parameters.
#define GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(kind0, name0) kind0 name0
-#define GMOCK_INTERNAL_DECL_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, \
- name1) kind0 name0, kind1 name1
+#define GMOCK_INTERNAL_DECL_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, name1) \
+ kind0 name0, kind1 name1
#define GMOCK_INTERNAL_DECL_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2) kind0 name0, kind1 name1, kind2 name2
+ kind2, name2) \
+ kind0 name0, kind1 name1, kind2 name2
#define GMOCK_INTERNAL_DECL_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3) kind0 name0, kind1 name1, kind2 name2, \
- kind3 name3
-#define GMOCK_INTERNAL_DECL_HAS_5_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4) kind0 name0, kind1 name1, \
- kind2 name2, kind3 name3, kind4 name4
+ kind2, name2, kind3, name3) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3
+#define GMOCK_INTERNAL_DECL_HAS_5_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4
#define GMOCK_INTERNAL_DECL_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5) kind0 name0, \
- kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5
-#define GMOCK_INTERNAL_DECL_HAS_7_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
- name6) kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
- kind5 name5, kind6 name6
-#define GMOCK_INTERNAL_DECL_HAS_8_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
- kind7, name7) kind0 name0, kind1 name1, kind2 name2, kind3 name3, \
- kind4 name4, kind5 name5, kind6 name6, kind7 name7
-#define GMOCK_INTERNAL_DECL_HAS_9_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
- kind7, name7, kind8, name8) kind0 name0, kind1 name1, kind2 name2, \
- kind3 name3, kind4 name4, kind5 name5, kind6 name6, kind7 name7, \
- kind8 name8
-#define GMOCK_INTERNAL_DECL_HAS_10_TEMPLATE_PARAMS(kind0, name0, kind1, \
- name1, kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
- name6, kind7, name7, kind8, name8, kind9, name9) kind0 name0, \
- kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5, \
- kind6 name6, kind7 name7, kind8 name8, kind9 name9
+ kind2, name2, kind3, name3, \
+ kind4, name4, kind5, name5) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5
+#define GMOCK_INTERNAL_DECL_HAS_7_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6
+#define GMOCK_INTERNAL_DECL_HAS_8_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6, kind7 name7
+#define GMOCK_INTERNAL_DECL_HAS_9_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6, kind7 name7, kind8 name8
+#define GMOCK_INTERNAL_DECL_HAS_10_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8, kind9, name9) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6, kind7 name7, kind8 name8, kind9 name9
// Lists the template parameters.
#define GMOCK_INTERNAL_LIST_HAS_1_TEMPLATE_PARAMS(kind0, name0) name0
-#define GMOCK_INTERNAL_LIST_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, \
- name1) name0, name1
+#define GMOCK_INTERNAL_LIST_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, name1) \
+ name0, name1
#define GMOCK_INTERNAL_LIST_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2) name0, name1, name2
+ kind2, name2) \
+ name0, name1, name2
#define GMOCK_INTERNAL_LIST_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3) name0, name1, name2, name3
-#define GMOCK_INTERNAL_LIST_HAS_5_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4) name0, name1, name2, name3, \
- name4
+ kind2, name2, kind3, name3) \
+ name0, name1, name2, name3
+#define GMOCK_INTERNAL_LIST_HAS_5_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4) \
+ name0, name1, name2, name3, name4
#define GMOCK_INTERNAL_LIST_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5) name0, name1, \
- name2, name3, name4, name5
-#define GMOCK_INTERNAL_LIST_HAS_7_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
- name6) name0, name1, name2, name3, name4, name5, name6
-#define GMOCK_INTERNAL_LIST_HAS_8_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
- kind7, name7) name0, name1, name2, name3, name4, name5, name6, name7
-#define GMOCK_INTERNAL_LIST_HAS_9_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
- kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
- kind7, name7, kind8, name8) name0, name1, name2, name3, name4, name5, \
- name6, name7, name8
-#define GMOCK_INTERNAL_LIST_HAS_10_TEMPLATE_PARAMS(kind0, name0, kind1, \
- name1, kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
- name6, kind7, name7, kind8, name8, kind9, name9) name0, name1, name2, \
- name3, name4, name5, name6, name7, name8, name9
+ kind2, name2, kind3, name3, \
+ kind4, name4, kind5, name5) \
+ name0, name1, name2, name3, name4, name5
+#define GMOCK_INTERNAL_LIST_HAS_7_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6) \
+ name0, name1, name2, name3, name4, name5, name6
+#define GMOCK_INTERNAL_LIST_HAS_8_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7) \
+ name0, name1, name2, name3, name4, name5, name6, name7
+#define GMOCK_INTERNAL_LIST_HAS_9_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8) \
+ name0, name1, name2, name3, name4, name5, name6, name7, name8
+#define GMOCK_INTERNAL_LIST_HAS_10_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8, kind9, name9) \
+ name0, name1, name2, name3, name4, name5, name6, name7, name8, name9
// Declares the types of value parameters.
#define GMOCK_INTERNAL_DECL_TYPE_AND_0_VALUE_PARAMS()
#define GMOCK_INTERNAL_DECL_TYPE_AND_1_VALUE_PARAMS(p0) , typename p0##_type
-#define GMOCK_INTERNAL_DECL_TYPE_AND_2_VALUE_PARAMS(p0, p1) , \
- typename p0##_type, typename p1##_type
-#define GMOCK_INTERNAL_DECL_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) , \
- typename p0##_type, typename p1##_type, typename p2##_type
-#define GMOCK_INTERNAL_DECL_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) , \
- typename p0##_type, typename p1##_type, typename p2##_type, \
- typename p3##_type
-#define GMOCK_INTERNAL_DECL_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) , \
- typename p0##_type, typename p1##_type, typename p2##_type, \
- typename p3##_type, typename p4##_type
-#define GMOCK_INTERNAL_DECL_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) , \
- typename p0##_type, typename p1##_type, typename p2##_type, \
- typename p3##_type, typename p4##_type, typename p5##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_2_VALUE_PARAMS(p0, p1) \
+ , typename p0##_type, typename p1##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ , typename p0##_type, typename p1##_type, typename p2##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type
#define GMOCK_INTERNAL_DECL_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6) , typename p0##_type, typename p1##_type, typename p2##_type, \
- typename p3##_type, typename p4##_type, typename p5##_type, \
- typename p6##_type
+ p6) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type
#define GMOCK_INTERNAL_DECL_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6, p7) , typename p0##_type, typename p1##_type, typename p2##_type, \
- typename p3##_type, typename p4##_type, typename p5##_type, \
- typename p6##_type, typename p7##_type
+ p6, p7) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type, typename p7##_type
#define GMOCK_INTERNAL_DECL_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6, p7, p8) , typename p0##_type, typename p1##_type, typename p2##_type, \
- typename p3##_type, typename p4##_type, typename p5##_type, \
- typename p6##_type, typename p7##_type, typename p8##_type
+ p6, p7, p8) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type, typename p7##_type, typename p8##_type
#define GMOCK_INTERNAL_DECL_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6, p7, p8, p9) , typename p0##_type, typename p1##_type, \
- typename p2##_type, typename p3##_type, typename p4##_type, \
- typename p5##_type, typename p6##_type, typename p7##_type, \
- typename p8##_type, typename p9##_type
+ p6, p7, p8, p9) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type, typename p7##_type, typename p8##_type, \
+ typename p9##_type
// Initializes the value parameters.
-#define GMOCK_INTERNAL_INIT_AND_0_VALUE_PARAMS()\
- ()
-#define GMOCK_INTERNAL_INIT_AND_1_VALUE_PARAMS(p0)\
- (p0##_type gmock_p0) : p0(::std::move(gmock_p0))
-#define GMOCK_INTERNAL_INIT_AND_2_VALUE_PARAMS(p0, p1)\
- (p0##_type gmock_p0, p1##_type gmock_p1) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1))
-#define GMOCK_INTERNAL_INIT_AND_3_VALUE_PARAMS(p0, p1, p2)\
- (p0##_type gmock_p0, p1##_type gmock_p1, \
- p2##_type gmock_p2) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2))
-#define GMOCK_INTERNAL_INIT_AND_4_VALUE_PARAMS(p0, p1, p2, p3)\
- (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
- p3##_type gmock_p3) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+#define GMOCK_INTERNAL_INIT_AND_0_VALUE_PARAMS() ()
+#define GMOCK_INTERNAL_INIT_AND_1_VALUE_PARAMS(p0) \
+ (p0##_type gmock_p0) : p0(::std::move(gmock_p0))
+#define GMOCK_INTERNAL_INIT_AND_2_VALUE_PARAMS(p0, p1) \
+ (p0##_type gmock_p0, p1##_type gmock_p1) \
+ : p0(::std::move(gmock_p0)), p1(::std::move(gmock_p1))
+#define GMOCK_INTERNAL_INIT_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2))
+#define GMOCK_INTERNAL_INIT_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
p3(::std::move(gmock_p3))
-#define GMOCK_INTERNAL_INIT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4)\
- (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
- p3##_type gmock_p3, p4##_type gmock_p4) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
- p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4))
-#define GMOCK_INTERNAL_INIT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5)\
- (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
- p3##_type gmock_p3, p4##_type gmock_p4, \
- p5##_type gmock_p5) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
- p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
+#define GMOCK_INTERNAL_INIT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4))
+#define GMOCK_INTERNAL_INIT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
p5(::std::move(gmock_p5))
-#define GMOCK_INTERNAL_INIT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6)\
- (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
- p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
- p6##_type gmock_p6) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
- p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
- p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6))
-#define GMOCK_INTERNAL_INIT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7)\
- (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
- p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
- p6##_type gmock_p6, p7##_type gmock_p7) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
- p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
- p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6)), \
+#define GMOCK_INTERNAL_INIT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6))
+#define GMOCK_INTERNAL_INIT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6, p7##_type gmock_p7) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6)), \
p7(::std::move(gmock_p7))
-#define GMOCK_INTERNAL_INIT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8)\
- (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
- p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
- p6##_type gmock_p6, p7##_type gmock_p7, \
- p8##_type gmock_p8) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
- p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
- p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6)), \
- p7(::std::move(gmock_p7)), p8(::std::move(gmock_p8))
+#define GMOCK_INTERNAL_INIT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6)), \
+ p7(::std::move(gmock_p7)), \
+ p8(::std::move(gmock_p8))
#define GMOCK_INTERNAL_INIT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8, p9)\
- (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
- p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
- p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \
- p9##_type gmock_p9) : p0(::std::move(gmock_p0)), \
- p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
- p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
- p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6)), \
- p7(::std::move(gmock_p7)), p8(::std::move(gmock_p8)), \
+ p7, p8, p9) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \
+ p9##_type gmock_p9) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6)), \
+ p7(::std::move(gmock_p7)), \
+ p8(::std::move(gmock_p8)), \
p9(::std::move(gmock_p9))
// Defines the copy constructor
#define GMOCK_INTERNAL_DEFN_COPY_AND_0_VALUE_PARAMS() \
- {} // Avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82134
+ {} // Avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82134
#define GMOCK_INTERNAL_DEFN_COPY_AND_1_VALUE_PARAMS(...) = default;
#define GMOCK_INTERNAL_DEFN_COPY_AND_2_VALUE_PARAMS(...) = default;
#define GMOCK_INTERNAL_DEFN_COPY_AND_3_VALUE_PARAMS(...) = default;
@@ -307,30 +344,71 @@
// Declares the fields for storing the value parameters.
#define GMOCK_INTERNAL_DEFN_AND_0_VALUE_PARAMS()
#define GMOCK_INTERNAL_DEFN_AND_1_VALUE_PARAMS(p0) p0##_type p0;
-#define GMOCK_INTERNAL_DEFN_AND_2_VALUE_PARAMS(p0, p1) p0##_type p0; \
- p1##_type p1;
-#define GMOCK_INTERNAL_DEFN_AND_3_VALUE_PARAMS(p0, p1, p2) p0##_type p0; \
- p1##_type p1; p2##_type p2;
-#define GMOCK_INTERNAL_DEFN_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0##_type p0; \
- p1##_type p1; p2##_type p2; p3##_type p3;
-#define GMOCK_INTERNAL_DEFN_AND_5_VALUE_PARAMS(p0, p1, p2, p3, \
- p4) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4;
-#define GMOCK_INTERNAL_DEFN_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, \
- p5) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \
- p5##_type p5;
-#define GMOCK_INTERNAL_DEFN_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \
- p5##_type p5; p6##_type p6;
-#define GMOCK_INTERNAL_DEFN_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \
- p5##_type p5; p6##_type p6; p7##_type p7;
-#define GMOCK_INTERNAL_DEFN_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; \
- p4##_type p4; p5##_type p5; p6##_type p6; p7##_type p7; p8##_type p8;
+#define GMOCK_INTERNAL_DEFN_AND_2_VALUE_PARAMS(p0, p1) \
+ p0##_type p0; \
+ p1##_type p1;
+#define GMOCK_INTERNAL_DEFN_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2;
+#define GMOCK_INTERNAL_DEFN_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3;
+#define GMOCK_INTERNAL_DEFN_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4;
+#define GMOCK_INTERNAL_DEFN_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5;
+#define GMOCK_INTERNAL_DEFN_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6;
+#define GMOCK_INTERNAL_DEFN_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6; \
+ p7##_type p7;
+#define GMOCK_INTERNAL_DEFN_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6; \
+ p7##_type p7; \
+ p8##_type p8;
#define GMOCK_INTERNAL_DEFN_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8, p9) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; \
- p4##_type p4; p5##_type p5; p6##_type p6; p7##_type p7; p8##_type p8; \
- p9##_type p9;
+ p7, p8, p9) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6; \
+ p7##_type p7; \
+ p8##_type p8; \
+ p9##_type p9;
// Lists the value parameters.
#define GMOCK_INTERNAL_LIST_AND_0_VALUE_PARAMS()
@@ -338,72 +416,78 @@
#define GMOCK_INTERNAL_LIST_AND_2_VALUE_PARAMS(p0, p1) p0, p1
#define GMOCK_INTERNAL_LIST_AND_3_VALUE_PARAMS(p0, p1, p2) p0, p1, p2
#define GMOCK_INTERNAL_LIST_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0, p1, p2, p3
-#define GMOCK_INTERNAL_LIST_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) p0, p1, \
- p2, p3, p4
-#define GMOCK_INTERNAL_LIST_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) p0, \
- p1, p2, p3, p4, p5
-#define GMOCK_INTERNAL_LIST_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6) p0, p1, p2, p3, p4, p5, p6
-#define GMOCK_INTERNAL_LIST_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7) p0, p1, p2, p3, p4, p5, p6, p7
-#define GMOCK_INTERNAL_LIST_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8) p0, p1, p2, p3, p4, p5, p6, p7, p8
+#define GMOCK_INTERNAL_LIST_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ p0, p1, p2, p3, p4
+#define GMOCK_INTERNAL_LIST_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ p0, p1, p2, p3, p4, p5
+#define GMOCK_INTERNAL_LIST_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ p0, p1, p2, p3, p4, p5, p6
+#define GMOCK_INTERNAL_LIST_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ p0, p1, p2, p3, p4, p5, p6, p7
+#define GMOCK_INTERNAL_LIST_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ p0, p1, p2, p3, p4, p5, p6, p7, p8
#define GMOCK_INTERNAL_LIST_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8, p9) p0, p1, p2, p3, p4, p5, p6, p7, p8, p9
+ p7, p8, p9) \
+ p0, p1, p2, p3, p4, p5, p6, p7, p8, p9
// Lists the value parameter types.
#define GMOCK_INTERNAL_LIST_TYPE_AND_0_VALUE_PARAMS()
#define GMOCK_INTERNAL_LIST_TYPE_AND_1_VALUE_PARAMS(p0) , p0##_type
-#define GMOCK_INTERNAL_LIST_TYPE_AND_2_VALUE_PARAMS(p0, p1) , p0##_type, \
- p1##_type
-#define GMOCK_INTERNAL_LIST_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) , p0##_type, \
- p1##_type, p2##_type
-#define GMOCK_INTERNAL_LIST_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) , \
- p0##_type, p1##_type, p2##_type, p3##_type
-#define GMOCK_INTERNAL_LIST_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) , \
- p0##_type, p1##_type, p2##_type, p3##_type, p4##_type
-#define GMOCK_INTERNAL_LIST_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) , \
- p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_2_VALUE_PARAMS(p0, p1) \
+ , p0##_type, p1##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ , p0##_type, p1##_type, p2##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ , p0##_type, p1##_type, p2##_type, p3##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type
#define GMOCK_INTERNAL_LIST_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
- p6##_type
+ p6) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, p6##_type
#define GMOCK_INTERNAL_LIST_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6, p7) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \
- p5##_type, p6##_type, p7##_type
+ p6, p7) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
+ p6##_type, p7##_type
#define GMOCK_INTERNAL_LIST_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6, p7, p8) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \
- p5##_type, p6##_type, p7##_type, p8##_type
+ p6, p7, p8) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
+ p6##_type, p7##_type, p8##_type
#define GMOCK_INTERNAL_LIST_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6, p7, p8, p9) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \
- p5##_type, p6##_type, p7##_type, p8##_type, p9##_type
+ p6, p7, p8, p9) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
+ p6##_type, p7##_type, p8##_type, p9##_type
// Declares the value parameters.
#define GMOCK_INTERNAL_DECL_AND_0_VALUE_PARAMS()
#define GMOCK_INTERNAL_DECL_AND_1_VALUE_PARAMS(p0) p0##_type p0
-#define GMOCK_INTERNAL_DECL_AND_2_VALUE_PARAMS(p0, p1) p0##_type p0, \
- p1##_type p1
-#define GMOCK_INTERNAL_DECL_AND_3_VALUE_PARAMS(p0, p1, p2) p0##_type p0, \
- p1##_type p1, p2##_type p2
-#define GMOCK_INTERNAL_DECL_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0##_type p0, \
- p1##_type p1, p2##_type p2, p3##_type p3
-#define GMOCK_INTERNAL_DECL_AND_5_VALUE_PARAMS(p0, p1, p2, p3, \
- p4) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4
-#define GMOCK_INTERNAL_DECL_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, \
- p5) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
- p5##_type p5
-#define GMOCK_INTERNAL_DECL_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
- p6) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
- p5##_type p5, p6##_type p6
-#define GMOCK_INTERNAL_DECL_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
- p5##_type p5, p6##_type p6, p7##_type p7
-#define GMOCK_INTERNAL_DECL_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \
- p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8
+#define GMOCK_INTERNAL_DECL_AND_2_VALUE_PARAMS(p0, p1) \
+ p0##_type p0, p1##_type p1
+#define GMOCK_INTERNAL_DECL_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ p0##_type p0, p1##_type p1, p2##_type p2
+#define GMOCK_INTERNAL_DECL_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3
+#define GMOCK_INTERNAL_DECL_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4
+#define GMOCK_INTERNAL_DECL_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5
+#define GMOCK_INTERNAL_DECL_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6
+#define GMOCK_INTERNAL_DECL_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6, p7##_type p7
+#define GMOCK_INTERNAL_DECL_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8
#define GMOCK_INTERNAL_DECL_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8, p9) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \
- p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, \
- p9##_type p9
+ p7, p8, p9) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, p9##_type p9
// The suffix of the class template implementing the action template.
#define GMOCK_INTERNAL_COUNT_AND_0_VALUE_PARAMS()
@@ -415,40 +499,43 @@
#define GMOCK_INTERNAL_COUNT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) P6
#define GMOCK_INTERNAL_COUNT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) P7
#define GMOCK_INTERNAL_COUNT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7) P8
+ p7) \
+ P8
#define GMOCK_INTERNAL_COUNT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8) P9
+ p7, p8) \
+ P9
#define GMOCK_INTERNAL_COUNT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
- p7, p8, p9) P10
+ p7, p8, p9) \
+ P10
// The name of the class template implementing the action template.
-#define GMOCK_ACTION_CLASS_(name, value_params)\
- GTEST_CONCAT_TOKEN_(name##Action, GMOCK_INTERNAL_COUNT_##value_params)
+#define GMOCK_ACTION_CLASS_(name, value_params) \
+ GTEST_CONCAT_TOKEN_(name##Action, GMOCK_INTERNAL_COUNT_##value_params)
#define ACTION_TEMPLATE(name, template_params, value_params) \
template <GMOCK_INTERNAL_DECL_##template_params \
- GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
class GMOCK_ACTION_CLASS_(name, value_params) { \
public: \
explicit GMOCK_ACTION_CLASS_(name, value_params)( \
GMOCK_INTERNAL_DECL_##value_params) \
GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), \
- = default; , \
+ = default; \
+ , \
: impl_(std::make_shared<gmock_Impl>( \
- GMOCK_INTERNAL_LIST_##value_params)) { }) \
- GMOCK_ACTION_CLASS_(name, value_params)( \
- const GMOCK_ACTION_CLASS_(name, value_params)&) noexcept \
- GMOCK_INTERNAL_DEFN_COPY_##value_params \
- GMOCK_ACTION_CLASS_(name, value_params)( \
- GMOCK_ACTION_CLASS_(name, value_params)&&) noexcept \
- GMOCK_INTERNAL_DEFN_COPY_##value_params \
- template <typename F> \
- operator ::testing::Action<F>() const { \
+ GMOCK_INTERNAL_LIST_##value_params)){}) \
+ GMOCK_ACTION_CLASS_(name, value_params)(const GMOCK_ACTION_CLASS_( \
+ name, value_params) &) noexcept GMOCK_INTERNAL_DEFN_COPY_ \
+ ##value_params GMOCK_ACTION_CLASS_(name, value_params)( \
+ GMOCK_ACTION_CLASS_(name, value_params) &&) noexcept \
+ GMOCK_INTERNAL_DEFN_COPY_##value_params template <typename F> \
+ operator ::testing::Action<F>() const { \
return GMOCK_PP_IF( \
GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), \
- (::testing::internal::MakeAction<F, gmock_Impl>()), \
- (::testing::internal::MakeAction<F>(impl_))); \
+ (::testing::internal::MakeAction<F, gmock_Impl>()), \
+ (::testing::internal::MakeAction<F>(impl_))); \
} \
+ \
private: \
class gmock_Impl { \
public: \
@@ -458,34 +545,35 @@
return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
GMOCK_INTERNAL_DEFN_##value_params \
}; \
- GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), \
- , std::shared_ptr<const gmock_Impl> impl_;) \
+ GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), , \
+ std::shared_ptr<const gmock_Impl> impl_;) \
}; \
template <GMOCK_INTERNAL_DECL_##template_params \
- GMOCK_INTERNAL_DECL_TYPE_##value_params> \
- GMOCK_ACTION_CLASS_(name, value_params)< \
- GMOCK_INTERNAL_LIST_##template_params \
- GMOCK_INTERNAL_LIST_TYPE_##value_params> name( \
- GMOCK_INTERNAL_DECL_##value_params) GTEST_MUST_USE_RESULT_; \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params> \
+ name(GMOCK_INTERNAL_DECL_##value_params) GTEST_MUST_USE_RESULT_; \
template <GMOCK_INTERNAL_DECL_##template_params \
- GMOCK_INTERNAL_DECL_TYPE_##value_params> \
- inline GMOCK_ACTION_CLASS_(name, value_params)< \
- GMOCK_INTERNAL_LIST_##template_params \
- GMOCK_INTERNAL_LIST_TYPE_##value_params> name( \
- GMOCK_INTERNAL_DECL_##value_params) { \
- return GMOCK_ACTION_CLASS_(name, value_params)< \
- GMOCK_INTERNAL_LIST_##template_params \
- GMOCK_INTERNAL_LIST_TYPE_##value_params>( \
- GMOCK_INTERNAL_LIST_##value_params); \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ inline GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params> \
+ name(GMOCK_INTERNAL_DECL_##value_params) { \
+ return GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params>( \
+ GMOCK_INTERNAL_LIST_##value_params); \
} \
template <GMOCK_INTERNAL_DECL_##template_params \
- GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
template <typename function_type, typename return_type, typename args_type, \
GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
- return_type GMOCK_ACTION_CLASS_(name, value_params)< \
- GMOCK_INTERNAL_LIST_##template_params \
- GMOCK_INTERNAL_LIST_TYPE_##value_params>::gmock_Impl::gmock_PerformImpl( \
- GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
+ return_type GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params>:: \
+ gmock_Impl::gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) \
+ const
namespace testing {
@@ -495,8 +583,8 @@ namespace testing {
// is expanded and macro expansion cannot contain #pragma. Therefore
// we suppress them here.
#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4100)
+#pragma warning(push)
+#pragma warning(disable : 4100)
#endif
namespace internal {
@@ -512,7 +600,8 @@ auto InvokeArgument(F f, Args... args) -> decltype(f(args...)) {
template <std::size_t index, typename... Params>
struct InvokeArgumentAction {
- template <typename... Args>
+ template <typename... Args,
+ typename = typename std::enable_if<(index < sizeof...(Args))>::type>
auto operator()(Args&&... args) const -> decltype(internal::InvokeArgument(
std::get<index>(std::forward_as_tuple(std::forward<Args>(args)...)),
std::declval<const Params&>()...)) {
@@ -565,7 +654,7 @@ InvokeArgument(Params&&... params) {
}
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
} // namespace testing
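For context on the InvokeArgumentAction change above (the added enable_if only removes the overload when index is out of range for the call), a small usage sketch with a hypothetical mock:

  #include <functional>
  #include "gmock/gmock.h"

  struct MockSink {
    MOCK_METHOD(void, Write, (int id, const std::function<void(int)>& done));
  };

  TEST(InvokeArgumentSketch, InvokesTheCallbackArgument) {
    MockSink sink;
    EXPECT_CALL(sink, Write(7, testing::_))
        .WillOnce(testing::InvokeArgument<1>(200));  // calls done(200)
    sink.Write(7, [](int code) { EXPECT_EQ(code, 200); });
  }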
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h b/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h
index dfc77e359..47aaf9846 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file implements some matchers that depend on gmock-matchers.h.
@@ -35,7 +34,8 @@
// Note that tests are implemented in gmock-matchers_test.cc rather than
// gmock-more-matchers-test.cc.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
@@ -47,13 +47,13 @@ namespace testing {
// Silence C4100 (unreferenced formal
// parameter) for MSVC
#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4100)
+#pragma warning(push)
+#pragma warning(disable : 4100)
#if (_MSC_VER == 1900)
// and silence C4800 (C4800: 'int *const ': forcing value
// to bool 'true' or 'false') for MSVC 14
-# pragma warning(disable:4800)
- #endif
+#pragma warning(disable : 4800)
+#endif
#endif
// Defines a matcher that matches an empty container. The container must
@@ -83,10 +83,9 @@ MATCHER(IsFalse, negation ? "is true" : "is false") {
}
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
-
} // namespace testing
#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h b/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h
index b03b770c7..4f0eb35db 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Implements class templates NiceMock, NaggyMock, and StrictMock.
//
// Given a mock class MockFoo that is created using Google Mock,
@@ -58,11 +57,13 @@
// In particular, nesting NiceMock, NaggyMock, and StrictMock is NOT
// supported.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
+#include <cstdint>
#include <type_traits>
#include "gmock/gmock-spec-builders.h"
@@ -109,25 +110,37 @@ constexpr bool HasStrictnessModifier() {
template <typename Base>
class NiceMockImpl {
public:
- NiceMockImpl() { ::testing::Mock::AllowUninterestingCalls(this); }
+ NiceMockImpl() {
+ ::testing::Mock::AllowUninterestingCalls(reinterpret_cast<uintptr_t>(this));
+ }
- ~NiceMockImpl() { ::testing::Mock::UnregisterCallReaction(this); }
+ ~NiceMockImpl() {
+ ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this));
+ }
};
template <typename Base>
class NaggyMockImpl {
public:
- NaggyMockImpl() { ::testing::Mock::WarnUninterestingCalls(this); }
+ NaggyMockImpl() {
+ ::testing::Mock::WarnUninterestingCalls(reinterpret_cast<uintptr_t>(this));
+ }
- ~NaggyMockImpl() { ::testing::Mock::UnregisterCallReaction(this); }
+ ~NaggyMockImpl() {
+ ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this));
+ }
};
template <typename Base>
class StrictMockImpl {
public:
- StrictMockImpl() { ::testing::Mock::FailUninterestingCalls(this); }
+ StrictMockImpl() {
+ ::testing::Mock::FailUninterestingCalls(reinterpret_cast<uintptr_t>(this));
+ }
- ~StrictMockImpl() { ::testing::Mock::UnregisterCallReaction(this); }
+ ~StrictMockImpl() {
+ ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this));
+ }
};
} // namespace internal
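The strictness implementations above now key the call-reaction registry on the integer value of the mock's address instead of a const void*. A minimal standalone sketch of that conversion; the registry type and names are illustrative, not gMock internals:

#include <cstdint>
#include <map>
enum class Reaction { kAllow, kWarn, kFail };
std::map<std::uintptr_t, Reaction> g_reactions;
void SetReaction(const void* mock_obj, Reaction r) {
  // Store the address as an integer key, mirroring reinterpret_cast<uintptr_t>(this) above.
  g_reactions[reinterpret_cast<std::uintptr_t>(mock_obj)] = r;
}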
@@ -169,7 +182,8 @@ class GTEST_INTERNAL_EMPTY_BASE_CLASS NiceMock
}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(NiceMock);
+ NiceMock(const NiceMock&) = delete;
+ NiceMock& operator=(const NiceMock&) = delete;
};
template <class MockClass>
@@ -210,7 +224,8 @@ class GTEST_INTERNAL_EMPTY_BASE_CLASS NaggyMock
}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(NaggyMock);
+ NaggyMock(const NaggyMock&) = delete;
+ NaggyMock& operator=(const NaggyMock&) = delete;
};
template <class MockClass>
@@ -251,7 +266,8 @@ class GTEST_INTERNAL_EMPTY_BASE_CLASS StrictMock
}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StrictMock);
+ StrictMock(const StrictMock&) = delete;
+ StrictMock& operator=(const StrictMock&) = delete;
};
#undef GTEST_INTERNAL_EMPTY_BASE_CLASS
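The NiceMock/NaggyMock/StrictMock hunks above replace the GTEST_DISALLOW_COPY_AND_ASSIGN_ macro with explicitly deleted members. A minimal sketch of the same idiom on a hypothetical class:

class NonCopyable {
 public:
  NonCopyable() = default;
  // Deleting the copy operations makes accidental copies a compile-time error.
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
};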
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h b/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h
index 41323c1cc..45cc60518 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file implements the ON_CALL() and EXPECT_CALL() macros.
@@ -56,11 +55,13 @@
// where all clauses are optional, and .InSequence()/.After()/
// .WillOnce() can appear any number of times.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
+#include <cstdint>
#include <functional>
#include <map>
#include <memory>
@@ -70,6 +71,7 @@
#include <type_traits>
#include <utility>
#include <vector>
+
#include "gmock/gmock-actions.h"
#include "gmock/gmock-cardinalities.h"
#include "gmock/gmock-matchers.h"
@@ -78,7 +80,7 @@
#include "gtest/gtest.h"
#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept> // NOLINT
+#include <stdexcept> // NOLINT
#endif
GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
@@ -97,13 +99,15 @@ class ExpectationSet;
namespace internal {
// Implements a mock function.
-template <typename F> class FunctionMocker;
+template <typename F>
+class FunctionMocker;
// Base class for expectations.
class ExpectationBase;
// Implements an expectation.
-template <typename F> class TypedExpectation;
+template <typename F>
+class TypedExpectation;
// Helper class for testing the Expectation class template.
class ExpectationTester;
@@ -129,9 +133,6 @@ class NaggyMockImpl;
// calls to ensure the integrity of the mock objects' states.
GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_gmock_mutex);
-// Untyped base class for ActionResultHolder<R>.
-class UntypedActionResultHolderBase;
-
// Abstract base class of FunctionMocker. This is the
// type-agnostic part of the function mocker interface. Its pure
// virtual methods are implemented by FunctionMocker.
@@ -154,27 +155,12 @@ class GTEST_API_ UntypedFunctionMockerBase {
// responsibility to guarantee the correctness of the arguments'
// types.
- // Performs the default action with the given arguments and returns
- // the action's result. The call description string will be used in
- // the error message to describe the call in the case the default
- // action fails.
- // L = *
- virtual UntypedActionResultHolderBase* UntypedPerformDefaultAction(
- void* untyped_args, const std::string& call_description) const = 0;
-
- // Performs the given action with the given arguments and returns
- // the action's result.
- // L = *
- virtual UntypedActionResultHolderBase* UntypedPerformAction(
- const void* untyped_action, void* untyped_args) const = 0;
-
// Writes a message that the call is uninteresting (i.e. neither
// explicitly expected nor explicitly unexpected) to the given
// ostream.
- virtual void UntypedDescribeUninterestingCall(
- const void* untyped_args,
- ::std::ostream* os) const
- GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+ virtual void UntypedDescribeUninterestingCall(const void* untyped_args,
+ ::std::ostream* os) const
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
// Returns the expectation that matches the given function arguments
// (or NULL if there's no match); when a match is found,
@@ -183,10 +169,9 @@ class GTEST_API_ UntypedFunctionMockerBase {
// is_excessive is modified to indicate whether the call exceeds the
// expected number.
virtual const ExpectationBase* UntypedFindMatchingExpectation(
- const void* untyped_args,
- const void** untyped_action, bool* is_excessive,
+ const void* untyped_args, const void** untyped_action, bool* is_excessive,
::std::ostream* what, ::std::ostream* why)
- GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
// Prints the given function arguments to the ostream.
virtual void UntypedPrintArgs(const void* untyped_args,
@@ -196,8 +181,7 @@ class GTEST_API_ UntypedFunctionMockerBase {
// this information in the global mock registry. Will be called
// whenever an EXPECT_CALL() or ON_CALL() is executed on this mock
// method.
- void RegisterOwner(const void* mock_obj)
- GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+ void RegisterOwner(const void* mock_obj) GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
// Sets the mock object this mock method belongs to, and sets the
// name of the mock function. Will be called upon each invocation
@@ -208,20 +192,11 @@ class GTEST_API_ UntypedFunctionMockerBase {
// Returns the mock object this mock method belongs to. Must be
// called after RegisterOwner() or SetOwnerAndName() has been
// called.
- const void* MockObject() const
- GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+ const void* MockObject() const GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
// Returns the name of this mock method. Must be called after
// SetOwnerAndName() has been called.
- const char* Name() const
- GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
-
- // Returns the result of invoking this mock function with the given
- // arguments. This function can be safely called from multiple
- // threads concurrently. The caller is responsible for deleting the
- // result.
- UntypedActionResultHolderBase* UntypedInvokeWith(void* untyped_args)
- GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+ const char* Name() const GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
protected:
typedef std::vector<const void*> UntypedOnCallSpecs;
@@ -430,29 +405,28 @@ class GTEST_API_ Mock {
// Tells Google Mock to allow uninteresting calls on the given mock
// object.
- static void AllowUninterestingCalls(const void* mock_obj)
+ static void AllowUninterestingCalls(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
// Tells Google Mock to warn the user about uninteresting calls on
// the given mock object.
- static void WarnUninterestingCalls(const void* mock_obj)
+ static void WarnUninterestingCalls(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
// Tells Google Mock to fail uninteresting calls on the given mock
// object.
- static void FailUninterestingCalls(const void* mock_obj)
+ static void FailUninterestingCalls(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
// Tells Google Mock the given mock object is being destroyed and
// its entry in the call-reaction table should be removed.
- static void UnregisterCallReaction(const void* mock_obj)
+ static void UnregisterCallReaction(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
// Returns the reaction Google Mock will have on uninteresting calls
// made on the given mock object.
static internal::CallReaction GetReactionOnUninterestingCalls(
- const void* mock_obj)
- GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+ const void* mock_obj) GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
// Verifies that all expectations on the given mock object have been
// satisfied. Reports one or more Google Test non-fatal failures
@@ -465,17 +439,16 @@ class GTEST_API_ Mock {
GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex);
// Registers a mock object and a mock method it owns.
- static void Register(
- const void* mock_obj,
- internal::UntypedFunctionMockerBase* mocker)
- GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+ static void Register(const void* mock_obj,
+ internal::UntypedFunctionMockerBase* mocker)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
// Tells Google Mock where in the source code mock_obj is used in an
// ON_CALL or EXPECT_CALL. In case mock_obj is leaked, this
// information helps the user identify which object it is.
- static void RegisterUseByOnCallOrExpectCall(
- const void* mock_obj, const char* file, int line)
- GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+ static void RegisterUseByOnCallOrExpectCall(const void* mock_obj,
+ const char* file, int line)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
// Unregisters a mock method; removes the owning mock object from
// the registry when the last mock method associated with it has
@@ -632,7 +605,6 @@ class ExpectationSet {
Expectation::Set expectations_;
};
-
// Sequence objects are used by a user to specify the relative order
// in which the expectations should match. They are copyable (we rely
// on the compiler-defined copy constructor and assignment operator).
@@ -678,10 +650,12 @@ class GTEST_API_ InSequence {
public:
InSequence();
~InSequence();
+
private:
bool sequence_created_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InSequence); // NOLINT
+ InSequence(const InSequence&) = delete;
+ InSequence& operator=(const InSequence&) = delete;
} GTEST_ATTRIBUTE_UNUSED_;
namespace internal {
@@ -784,40 +758,34 @@ class GTEST_API_ ExpectationBase {
// the current thread.
// Retires all pre-requisites of this expectation.
- void RetireAllPreRequisites()
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+ void RetireAllPreRequisites() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
// Returns true if and only if this expectation is retired.
- bool is_retired() const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ bool is_retired() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
return retired_;
}
// Retires this expectation.
- void Retire()
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ void Retire() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
retired_ = true;
}
// Returns true if and only if this expectation is satisfied.
- bool IsSatisfied() const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ bool IsSatisfied() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
return cardinality().IsSatisfiedByCallCount(call_count_);
}
// Returns true if and only if this expectation is saturated.
- bool IsSaturated() const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ bool IsSaturated() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
return cardinality().IsSaturatedByCallCount(call_count_);
}
// Returns true if and only if this expectation is over-saturated.
- bool IsOverSaturated() const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ bool IsOverSaturated() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
return cardinality().IsOverSaturatedByCallCount(call_count_);
}
@@ -832,15 +800,13 @@ class GTEST_API_ ExpectationBase {
GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
// Returns the number this expectation has been invoked.
- int call_count() const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ int call_count() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
return call_count_;
}
// Increments the number this expectation has been invoked.
- void IncrementCallCount()
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ void IncrementCallCount() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
call_count_++;
}
@@ -849,8 +815,7 @@ class GTEST_API_ ExpectationBase {
// WillRepeatedly() clauses) against the cardinality if this hasn't
// been done before. Prints a warning if there are too many or too
// few actions.
- void CheckActionCountIfNotDone() const
- GTEST_LOCK_EXCLUDED_(mutex_);
+ void CheckActionCountIfNotDone() const GTEST_LOCK_EXCLUDED_(mutex_);
friend class ::testing::Sequence;
friend class ::testing::internal::ExpectationTester;
@@ -863,12 +828,12 @@ class GTEST_API_ ExpectationBase {
// This group of fields are part of the spec and won't change after
// an EXPECT_CALL() statement finishes.
- const char* file_; // The file that contains the expectation.
- int line_; // The line number of the expectation.
+ const char* file_; // The file that contains the expectation.
+ int line_; // The line number of the expectation.
const std::string source_text_; // The EXPECT_CALL(...) source text.
// True if and only if the cardinality is specified explicitly.
bool cardinality_specified_;
- Cardinality cardinality_; // The cardinality of the expectation.
+ Cardinality cardinality_; // The cardinality of the expectation.
// The immediate pre-requisites (i.e. expectations that must be
// satisfied before this expectation can be matched) of this
// expectation. We use std::shared_ptr in the set because we want an
@@ -887,12 +852,18 @@ class GTEST_API_ ExpectationBase {
bool retires_on_saturation_;
Clause last_clause_;
mutable bool action_count_checked_; // Under mutex_.
- mutable Mutex mutex_; // Protects action_count_checked_.
-}; // class ExpectationBase
+ mutable Mutex mutex_; // Protects action_count_checked_.
+}; // class ExpectationBase
-// Impements an expectation for the given function type.
template <typename F>
-class TypedExpectation : public ExpectationBase {
+class TypedExpectation;
+
+// Implements an expectation for the given function type.
+template <typename R, typename... Args>
+class TypedExpectation<R(Args...)> : public ExpectationBase {
+ private:
+ using F = R(Args...);
+
public:
typedef typename Function<F>::ArgumentTuple ArgumentTuple;
typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
@@ -945,9 +916,7 @@ class TypedExpectation : public ExpectationBase {
}
// Implements the .Times() clause.
- TypedExpectation& Times(int n) {
- return Times(Exactly(n));
- }
+ TypedExpectation& Times(int n) { return Times(Exactly(n)); }
// Implements the .InSequence() clause.
TypedExpectation& InSequence(const Sequence& s) {
@@ -1007,14 +976,31 @@ class TypedExpectation : public ExpectationBase {
return After(s1, s2, s3, s4).After(s5);
}
- // Implements the .WillOnce() clause.
- TypedExpectation& WillOnce(const Action<F>& action) {
+ // Preferred, type-safe overload: consume anything that can be directly
+ // converted to a OnceAction, except for Action<F> objects themselves.
+ TypedExpectation& WillOnce(OnceAction<F> once_action) {
+ // Call the overload below, smuggling the OnceAction as a copyable callable.
+ // We know this is safe because a WillOnce action will not be called more
+ // than once.
+ return WillOnce(Action<F>(ActionAdaptor{
+ std::make_shared<OnceAction<F>>(std::move(once_action)),
+ }));
+ }
+
+ // Fallback overload: accept Action<F> objects and those actions that define
+ // `operator Action<F>` but not `operator OnceAction<F>`.
+ //
+ // This is templated in order to cause the overload above to be preferred
+ // when the input is convertible to either type.
+ template <int&... ExplicitArgumentBarrier, typename = void>
+ TypedExpectation& WillOnce(Action<F> action) {
ExpectSpecProperty(last_clause_ <= kWillOnce,
".WillOnce() cannot appear after "
".WillRepeatedly() or .RetiresOnSaturation().");
last_clause_ = kWillOnce;
- untyped_actions_.push_back(new Action<F>(action));
+ untyped_actions_.push_back(new Action<F>(std::move(action)));
+
if (!cardinality_specified()) {
set_cardinality(Exactly(static_cast<int>(untyped_actions_.size())));
}
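A usage sketch of the new OnceAction-based WillOnce overload above; the mock object (factory), its Create() method returning std::unique_ptr<Widget>, and Widget itself are illustrative. A move-only, call-once callable is not copyable and so cannot become an Action<F>, but the preferred overload accepts it as a OnceAction<F>:

// Assuming: struct Widget {}; and a mock with
//   MOCK_METHOD(std::unique_ptr<Widget>, Create, (), ());
auto w = std::make_unique<Widget>();
EXPECT_CALL(factory, Create())
    .WillOnce([w = std::move(w)]() mutable { return std::move(w); });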
@@ -1062,9 +1048,7 @@ class TypedExpectation : public ExpectationBase {
// Returns the matchers for the arguments as specified inside the
// EXPECT_CALL() macro.
- const ArgumentMatcherTuple& matchers() const {
- return matchers_;
- }
+ const ArgumentMatcherTuple& matchers() const { return matchers_; }
// Returns the matcher specified by the .With() clause.
const Matcher<const ArgumentTuple&>& extra_matcher() const {
@@ -1088,6 +1072,16 @@ class TypedExpectation : public ExpectationBase {
template <typename Function>
friend class FunctionMocker;
+ // An adaptor that turns a OnceAction<F> into something compatible with
+ // Action<F>. Must be called at most once.
+ struct ActionAdaptor {
+ std::shared_ptr<OnceAction<R(Args...)>> once_action;
+
+ R operator()(Args&&... args) const {
+ return std::move(*once_action).Call(std::forward<Args>(args)...);
+ }
+ };
+
// Returns an Expectation object that references and co-owns this
// expectation.
Expectation GetHandle() override { return owner_->GetHandleOf(this); }
@@ -1119,10 +1113,8 @@ class TypedExpectation : public ExpectationBase {
// Describes the result of matching the arguments against this
// expectation to the given ostream.
- void ExplainMatchResultTo(
- const ArgumentTuple& args,
- ::std::ostream* os) const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ void ExplainMatchResultTo(const ArgumentTuple& args, ::std::ostream* os) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
if (is_retired()) {
@@ -1181,9 +1173,9 @@ class TypedExpectation : public ExpectationBase {
::std::stringstream ss;
DescribeLocationTo(&ss);
ss << "Actions ran out in " << source_text() << "...\n"
- << "Called " << count << " times, but only "
- << action_count << " WillOnce()"
- << (action_count == 1 ? " is" : "s are") << " specified - ";
+ << "Called " << count << " times, but only " << action_count
+ << " WillOnce()" << (action_count == 1 ? " is" : "s are")
+ << " specified - ";
mocker->DescribeDefaultActionTo(args, &ss);
Log(kWarning, ss.str(), 1);
}
@@ -1225,7 +1217,7 @@ class TypedExpectation : public ExpectationBase {
}
// Must be done after IncrementCount()!
- *what << "Mock function call matches " << source_text() <<"...\n";
+ *what << "Mock function call matches " << source_text() << "...\n";
return &(GetCurrentAction(mocker, args));
}
@@ -1236,7 +1228,8 @@ class TypedExpectation : public ExpectationBase {
Matcher<const ArgumentTuple&> extra_matcher_;
Action<F> repeated_action_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TypedExpectation);
+ TypedExpectation(const TypedExpectation&) = delete;
+ TypedExpectation& operator=(const TypedExpectation&) = delete;
}; // class TypedExpectation
// A MockSpec object is used by ON_CALL() or EXPECT_CALL() for
@@ -1258,8 +1251,8 @@ template <typename F>
class MockSpec {
public:
typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
- typedef typename internal::Function<F>::ArgumentMatcherTuple
- ArgumentMatcherTuple;
+ typedef
+ typename internal::Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
// Constructs a MockSpec object, given the function mocker object
// that the spec is associated with.
@@ -1269,8 +1262,9 @@ class MockSpec {
// Adds a new default action spec to the function mocker and returns
// the newly created spec.
- internal::OnCallSpec<F>& InternalDefaultActionSetAt(
- const char* file, int line, const char* obj, const char* call) {
+ internal::OnCallSpec<F>& InternalDefaultActionSetAt(const char* file,
+ int line, const char* obj,
+ const char* call) {
LogWithLocation(internal::kInfo, file, line,
std::string("ON_CALL(") + obj + ", " + call + ") invoked");
return function_mocker_->AddNewOnCallSpec(file, line, matchers_);
@@ -1278,13 +1272,14 @@ class MockSpec {
// Adds a new expectation spec to the function mocker and returns
// the newly created spec.
- internal::TypedExpectation<F>& InternalExpectedAt(
- const char* file, int line, const char* obj, const char* call) {
+ internal::TypedExpectation<F>& InternalExpectedAt(const char* file, int line,
+ const char* obj,
+ const char* call) {
const std::string source_text(std::string("EXPECT_CALL(") + obj + ", " +
call + ")");
LogWithLocation(internal::kInfo, file, line, source_text + " invoked");
- return function_mocker_->AddNewExpectation(
- file, line, source_text, matchers_);
+ return function_mocker_->AddNewExpectation(file, line, source_text,
+ matchers_);
}
// This operator overload is used to swallow the superfluous parameter list
@@ -1317,9 +1312,7 @@ template <typename T>
class ReferenceOrValueWrapper {
public:
// Constructs a wrapper from the given value/reference.
- explicit ReferenceOrValueWrapper(T value)
- : value_(std::move(value)) {
- }
+ explicit ReferenceOrValueWrapper(T value) : value_(std::move(value)) {}
// Unwraps and returns the underlying value/reference, exactly as
// originally passed. The behavior of calling this more than once on
@@ -1330,9 +1323,7 @@ class ReferenceOrValueWrapper {
// Always returns a const reference (more precisely,
// const std::add_lvalue_reference<T>::type). The behavior of calling this
// after calling Unwrap on the same object is unspecified.
- const T& Peek() const {
- return value_;
- }
+ const T& Peek() const { return value_; }
private:
T value_;
@@ -1346,8 +1337,7 @@ class ReferenceOrValueWrapper<T&> {
// Workaround for debatable pass-by-reference lint warning (c-library-team
// policy precludes NOLINT in this context)
typedef T& reference;
- explicit ReferenceOrValueWrapper(reference ref)
- : value_ptr_(&ref) {}
+ explicit ReferenceOrValueWrapper(reference ref) : value_ptr_(&ref) {}
T& Unwrap() { return *value_ptr_; }
const T& Peek() const { return *value_ptr_; }
@@ -1355,102 +1345,27 @@ class ReferenceOrValueWrapper<T&> {
T* value_ptr_;
};
-// C++ treats the void type specially. For example, you cannot define
-// a void-typed variable or pass a void value to a function.
-// ActionResultHolder<T> holds a value of type T, where T must be a
-// copyable type or void (T doesn't need to be default-constructable).
-// It hides the syntactic difference between void and other types, and
-// is used to unify the code for invoking both void-returning and
-// non-void-returning mock functions.
-
-// Untyped base class for ActionResultHolder<T>.
-class UntypedActionResultHolderBase {
- public:
- virtual ~UntypedActionResultHolderBase() {}
-
- // Prints the held value as an action's result to os.
- virtual void PrintAsActionResult(::std::ostream* os) const = 0;
-};
-
-// This generic definition is used when T is not void.
+// Prints the held value as an action's result to os.
template <typename T>
-class ActionResultHolder : public UntypedActionResultHolderBase {
- public:
- // Returns the held value. Must not be called more than once.
- T Unwrap() {
- return result_.Unwrap();
- }
-
- // Prints the held value as an action's result to os.
- void PrintAsActionResult(::std::ostream* os) const override {
- *os << "\n Returns: ";
- // T may be a reference type, so we don't use UniversalPrint().
- UniversalPrinter<T>::Print(result_.Peek(), os);
- }
-
- // Performs the given mock function's default action and returns the
- // result in a new-ed ActionResultHolder.
- template <typename F>
- static ActionResultHolder* PerformDefaultAction(
- const FunctionMocker<F>* func_mocker,
- typename Function<F>::ArgumentTuple&& args,
- const std::string& call_description) {
- return new ActionResultHolder(Wrapper(func_mocker->PerformDefaultAction(
- std::move(args), call_description)));
- }
-
- // Performs the given action and returns the result in a new-ed
- // ActionResultHolder.
- template <typename F>
- static ActionResultHolder* PerformAction(
- const Action<F>& action, typename Function<F>::ArgumentTuple&& args) {
- return new ActionResultHolder(
- Wrapper(action.Perform(std::move(args))));
- }
-
- private:
- typedef ReferenceOrValueWrapper<T> Wrapper;
-
- explicit ActionResultHolder(Wrapper result)
- : result_(std::move(result)) {
- }
-
- Wrapper result_;
+void PrintAsActionResult(const T& result, std::ostream& os) {
+ os << "\n Returns: ";
+ // T may be a reference type, so we don't use UniversalPrint().
+ UniversalPrinter<T>::Print(result, &os);
+}
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionResultHolder);
-};
+// Reports an uninteresting call (whose description is in msg) in the
+// manner specified by 'reaction'.
+GTEST_API_ void ReportUninterestingCall(CallReaction reaction,
+ const std::string& msg);
-// Specialization for T = void.
-template <>
-class ActionResultHolder<void> : public UntypedActionResultHolderBase {
+// A generic RAII type that runs a user-provided function in its destructor.
+class Cleanup final {
public:
- void Unwrap() { }
-
- void PrintAsActionResult(::std::ostream* /* os */) const override {}
-
- // Performs the given mock function's default action and returns ownership
- // of an empty ActionResultHolder*.
- template <typename F>
- static ActionResultHolder* PerformDefaultAction(
- const FunctionMocker<F>* func_mocker,
- typename Function<F>::ArgumentTuple&& args,
- const std::string& call_description) {
- func_mocker->PerformDefaultAction(std::move(args), call_description);
- return new ActionResultHolder;
- }
-
- // Performs the given action and returns ownership of an empty
- // ActionResultHolder*.
- template <typename F>
- static ActionResultHolder* PerformAction(
- const Action<F>& action, typename Function<F>::ArgumentTuple&& args) {
- action.Perform(std::move(args));
- return new ActionResultHolder;
- }
+ explicit Cleanup(std::function<void()> f) : f_(std::move(f)) {}
+ ~Cleanup() { f_(); }
private:
- ActionResultHolder() {}
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionResultHolder);
+ std::function<void()> f_;
};
template <typename F>
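A minimal sketch of how the Cleanup helper introduced above is used; it lives in testing::internal and is not public API, and the surrounding function and message are illustrative:

#include <cstdio>
void Example() {
  // The callable runs when the guard is destroyed, even on early return.
  const testing::internal::Cleanup log_on_exit([] { std::puts("done"); });
  // ... work that may return early ...
}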
@@ -1495,14 +1410,12 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
// Returns the ON_CALL spec that matches this mock function with the
// given arguments; returns NULL if no matching ON_CALL is found.
// L = *
- const OnCallSpec<F>* FindOnCallSpec(
- const ArgumentTuple& args) const {
- for (UntypedOnCallSpecs::const_reverse_iterator it
- = untyped_on_call_specs_.rbegin();
+ const OnCallSpec<F>* FindOnCallSpec(const ArgumentTuple& args) const {
+ for (UntypedOnCallSpecs::const_reverse_iterator it =
+ untyped_on_call_specs_.rbegin();
it != untyped_on_call_specs_.rend(); ++it) {
const OnCallSpec<F>* spec = static_cast<const OnCallSpec<F>*>(*it);
- if (spec->Matches(args))
- return spec;
+ if (spec->Matches(args)) return spec;
}
return nullptr;
@@ -1510,15 +1423,14 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
// Performs the default action of this mock function on the given
// arguments and returns the result. Asserts (or throws if
- // exceptions are enabled) with a helpful call descrption if there
+ // exceptions are enabled) with a helpful call description if there
// is no valid return value. This method doesn't depend on the
// mutable state of this object, and thus can be called concurrently
// without locking.
// L = *
Result PerformDefaultAction(ArgumentTuple&& args,
const std::string& call_description) const {
- const OnCallSpec<F>* const spec =
- this->FindOnCallSpec(args);
+ const OnCallSpec<F>* const spec = this->FindOnCallSpec(args);
if (spec != nullptr) {
return spec->GetAction().Perform(std::move(args));
}
@@ -1536,32 +1448,6 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
return DefaultValue<Result>::Get();
}
- // Performs the default action with the given arguments and returns
- // the action's result. The call description string will be used in
- // the error message to describe the call in the case the default
- // action fails. The caller is responsible for deleting the result.
- // L = *
- UntypedActionResultHolderBase* UntypedPerformDefaultAction(
- void* untyped_args, // must point to an ArgumentTuple
- const std::string& call_description) const override {
- ArgumentTuple* args = static_cast<ArgumentTuple*>(untyped_args);
- return ResultHolder::PerformDefaultAction(this, std::move(*args),
- call_description);
- }
-
- // Performs the given action with the given arguments and returns
- // the action's result. The caller is responsible for deleting the
- // result.
- // L = *
- UntypedActionResultHolderBase* UntypedPerformAction(
- const void* untyped_action, void* untyped_args) const override {
- // Make a copy of the action before performing it, in case the
- // action deletes the mock object (and thus deletes itself).
- const Action<F> action = *static_cast<const Action<F>*>(untyped_action);
- ArgumentTuple* args = static_cast<ArgumentTuple*>(untyped_args);
- return ResultHolder::PerformAction(action, std::move(*args));
- }
-
// Implements UntypedFunctionMockerBase::ClearDefaultActionsLocked():
// clears the ON_CALL()s set on this mock function.
void ClearDefaultActionsLocked() override
@@ -1579,8 +1465,7 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
untyped_on_call_specs_.swap(specs_to_delete);
g_gmock_mutex.Unlock();
- for (UntypedOnCallSpecs::const_iterator it =
- specs_to_delete.begin();
+ for (UntypedOnCallSpecs::const_iterator it = specs_to_delete.begin();
it != specs_to_delete.end(); ++it) {
delete static_cast<const OnCallSpec<F>*>(*it);
}
@@ -1594,10 +1479,7 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
// arguments. This function can be safely called from multiple
// threads concurrently.
Result Invoke(Args... args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
- ArgumentTuple tuple(std::forward<Args>(args)...);
- std::unique_ptr<ResultHolder> holder(DownCast_<ResultHolder*>(
- this->UntypedInvokeWith(static_cast<void*>(&tuple))));
- return holder->Unwrap();
+ return InvokeWith(ArgumentTuple(std::forward<Args>(args)...));
}
MockSpec<F> With(Matcher<Args>... m) {
@@ -1608,13 +1490,10 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
template <typename Function>
friend class MockSpec;
- typedef ActionResultHolder<Result> ResultHolder;
-
// Adds and returns a default action spec for this mock function.
- OnCallSpec<F>& AddNewOnCallSpec(
- const char* file, int line,
- const ArgumentMatcherTuple& m)
- GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ OnCallSpec<F>& AddNewOnCallSpec(const char* file, int line,
+ const ArgumentMatcherTuple& m)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line);
OnCallSpec<F>* const on_call_spec = new OnCallSpec<F>(file, line, m);
untyped_on_call_specs_.push_back(on_call_spec);
@@ -1644,7 +1523,8 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
}
private:
- template <typename Func> friend class TypedExpectation;
+ template <typename Func>
+ friend class TypedExpectation;
// Some utilities needed for implementing UntypedInvokeWith().
@@ -1728,9 +1608,8 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
// Returns the expectation that matches the arguments, or NULL if no
// expectation matches them.
- TypedExpectation<F>* FindMatchingExpectationLocked(
- const ArgumentTuple& args) const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ TypedExpectation<F>* FindMatchingExpectationLocked(const ArgumentTuple& args)
+ const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
// See the definition of untyped_expectations_ for why access to
// it is unprotected here.
@@ -1747,11 +1626,10 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
}
// Returns a message that the arguments don't match any expectation.
- void FormatUnexpectedCallMessageLocked(
- const ArgumentTuple& args,
- ::std::ostream* os,
- ::std::ostream* why) const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ void FormatUnexpectedCallMessageLocked(const ArgumentTuple& args,
+ ::std::ostream* os,
+ ::std::ostream* why) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
*os << "\nUnexpected mock function call - ";
DescribeDefaultActionTo(args, os);
@@ -1760,15 +1638,14 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
// Prints a list of expectations that have been tried against the
// current mock function call.
- void PrintTriedExpectationsLocked(
- const ArgumentTuple& args,
- ::std::ostream* why) const
- GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ void PrintTriedExpectationsLocked(const ArgumentTuple& args,
+ ::std::ostream* why) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
const size_t count = untyped_expectations_.size();
*why << "Google Mock tried the following " << count << " "
- << (count == 1 ? "expectation, but it didn't match" :
- "expectations, but none matched")
+ << (count == 1 ? "expectation, but it didn't match"
+ : "expectations, but none matched")
<< ":\n";
for (size_t i = 0; i < count; i++) {
TypedExpectation<F>* const expectation =
@@ -1783,11 +1660,177 @@ class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
expectation->DescribeCallCountTo(why);
}
}
+
+ // Performs the given action (or the default if it's null) with the given
+ // arguments and returns the action's result.
+ // L = *
+ R PerformAction(const void* untyped_action, ArgumentTuple&& args,
+ const std::string& call_description) const {
+ if (untyped_action == nullptr) {
+ return PerformDefaultAction(std::move(args), call_description);
+ }
+
+ // Make a copy of the action before performing it, in case the
+ // action deletes the mock object (and thus deletes itself).
+ const Action<F> action = *static_cast<const Action<F>*>(untyped_action);
+ return action.Perform(std::move(args));
+ }
+
+ // Is it possible to store an object of the supplied type in a local variable
+ // for the sake of printing it, then return it on to the caller?
+ template <typename T>
+ using can_print_result = internal::conjunction<
+ // void can't be stored as an object (and we also don't need to print it).
+ internal::negation<std::is_void<T>>,
+ // Non-moveable types can't be returned on to the user, so there's no way
+ // for us to intercept and print them.
+ std::is_move_constructible<T>>;
+
+ // Perform the supplied action, printing the result to os.
+ template <typename T = R,
+ typename std::enable_if<can_print_result<T>::value, int>::type = 0>
+ R PerformActionAndPrintResult(const void* const untyped_action,
+ ArgumentTuple&& args,
+ const std::string& call_description,
+ std::ostream& os) {
+ R result = PerformAction(untyped_action, std::move(args), call_description);
+
+ PrintAsActionResult(result, os);
+ return std::forward<R>(result);
+ }
+
+ // An overload for when it's not possible to print the result. In this case we
+ // simply perform the action.
+ template <typename T = R,
+ typename std::enable_if<
+ internal::negation<can_print_result<T>>::value, int>::type = 0>
+ R PerformActionAndPrintResult(const void* const untyped_action,
+ ArgumentTuple&& args,
+ const std::string& call_description,
+ std::ostream&) {
+ return PerformAction(untyped_action, std::move(args), call_description);
+ }
+
+ // Returns the result of invoking this mock function with the given
+ // arguments. This function can be safely called from multiple
+ // threads concurrently.
+ R InvokeWith(ArgumentTuple&& args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
}; // class FunctionMocker
-// Reports an uninteresting call (whose description is in msg) in the
-// manner specified by 'reaction'.
-void ReportUninterestingCall(CallReaction reaction, const std::string& msg);
+// Calculates the result of invoking this mock function with the given
+// arguments, prints it, and returns it.
+template <typename R, typename... Args>
+R FunctionMocker<R(Args...)>::InvokeWith(ArgumentTuple&& args)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ // See the definition of untyped_expectations_ for why access to it
+ // is unprotected here.
+ if (untyped_expectations_.size() == 0) {
+ // No expectation is set on this mock method - we have an
+ // uninteresting call.
+
+ // We must get Google Mock's reaction on uninteresting calls
+ // made on this mock object BEFORE performing the action,
+ // because the action may DELETE the mock object and make the
+ // following expression meaningless.
+ const CallReaction reaction =
+ Mock::GetReactionOnUninterestingCalls(MockObject());
+
+ // True if and only if we need to print this call's arguments and return
+ // value. This definition must be kept in sync with
+ // the behavior of ReportUninterestingCall().
+ const bool need_to_report_uninteresting_call =
+ // If the user allows this uninteresting call, we print it
+ // only when they want informational messages.
+ reaction == kAllow ? LogIsVisible(kInfo) :
+ // If the user wants this to be a warning, we print
+ // it only when they want to see warnings.
+ reaction == kWarn
+ ? LogIsVisible(kWarning)
+ :
+ // Otherwise, the user wants this to be an error, and we
+ // should always print detailed information in the error.
+ true;
+
+ if (!need_to_report_uninteresting_call) {
+ // Perform the action without printing the call information.
+ return this->PerformDefaultAction(
+ std::move(args), "Function call: " + std::string(Name()));
+ }
+
+ // Warns about the uninteresting call.
+ ::std::stringstream ss;
+ this->UntypedDescribeUninterestingCall(&args, &ss);
+
+ // Perform the action, print the result, and then report the uninteresting
+ // call.
+ //
+ // We use RAII to do the latter in case R is void or a non-moveable type. In
+ // either case we can't assign it to a local variable.
+ const Cleanup report_uninteresting_call(
+ [&] { ReportUninterestingCall(reaction, ss.str()); });
+
+ return PerformActionAndPrintResult(nullptr, std::move(args), ss.str(), ss);
+ }
+
+ bool is_excessive = false;
+ ::std::stringstream ss;
+ ::std::stringstream why;
+ ::std::stringstream loc;
+ const void* untyped_action = nullptr;
+
+ // The UntypedFindMatchingExpectation() function acquires and
+ // releases g_gmock_mutex.
+
+ const ExpectationBase* const untyped_expectation =
+ this->UntypedFindMatchingExpectation(&args, &untyped_action,
+ &is_excessive, &ss, &why);
+ const bool found = untyped_expectation != nullptr;
+
+ // True if and only if we need to print the call's arguments
+ // and return value.
+ // This definition must be kept in sync with the uses of Expect()
+ // and Log() in this function.
+ const bool need_to_report_call =
+ !found || is_excessive || LogIsVisible(kInfo);
+ if (!need_to_report_call) {
+ // Perform the action without printing the call information.
+ return PerformAction(untyped_action, std::move(args), "");
+ }
+
+ ss << " Function call: " << Name();
+ this->UntypedPrintArgs(&args, &ss);
+
+ // In case the action deletes a piece of the expectation, we
+ // generate the message beforehand.
+ if (found && !is_excessive) {
+ untyped_expectation->DescribeLocationTo(&loc);
+ }
+
+ // Perform the action, print the result, and then fail or log in whatever way
+ // is appropriate.
+ //
+ // We use RAII to do the latter in case R is void or a non-moveable type. In
+ // either case we can't assign it to a local variable.
+ const Cleanup handle_failures([&] {
+ ss << "\n" << why.str();
+
+ if (!found) {
+ // No expectation matches this call - reports a failure.
+ Expect(false, nullptr, -1, ss.str());
+ } else if (is_excessive) {
+ // We had an upper-bound violation and the failure message is in ss.
+ Expect(false, untyped_expectation->file(), untyped_expectation->line(),
+ ss.str());
+ } else {
+ // We had an expected call and the matching expectation is
+ // described in ss.
+ Log(kInfo, loc.str() + ss.str(), 2);
+ }
+ });
+
+ return PerformActionAndPrintResult(untyped_action, std::move(args), ss.str(),
+ ss);
+}
} // namespace internal
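A standalone restatement of the can_print_result trait used by PerformActionAndPrintResult above, written with the std:: equivalents of the internal conjunction/negation helpers; the sketch name is illustrative:

#include <type_traits>
template <typename T>
using CanPrintResultSketch =
    std::conjunction<std::negation<std::is_void<T>>,
                     std::is_move_constructible<T>>;
static_assert(!CanPrintResultSketch<void>::value, "void results are not printed");
static_assert(CanPrintResultSketch<int>::value, "movable results can be printed");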
@@ -1952,7 +1995,9 @@ using internal::MockSpec;
// // Expects a call to const MockFoo::Bar().
// EXPECT_CALL(Const(foo), Bar());
template <typename T>
-inline const T& Const(const T& x) { return x; }
+inline const T& Const(const T& x) {
+ return x;
+}
// Constructs an Expectation object that references and co-owns exp.
inline Expectation::Expectation(internal::ExpectationBase& exp) // NOLINT
diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock.h b/third_party/googletest/src/googlemock/include/gmock/gmock.h
index 12469bc46..568c8c71d 100644
--- a/third_party/googletest/src/googlemock/include/gmock/gmock.h
+++ b/third_party/googletest/src/googlemock/include/gmock/gmock.h
@@ -27,13 +27,10 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This is the main header file a user should include.
-// GOOGLETEST_CM0002 DO NOT DELETE
-
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
@@ -64,14 +61,15 @@
#include "gmock/gmock-more-matchers.h"
#include "gmock/gmock-nice-strict.h"
#include "gmock/internal/gmock-internal-utils.h"
-
-namespace testing {
+#include "gmock/internal/gmock-port.h"
// Declares Google Mock flags that we want a user to use programmatically.
GMOCK_DECLARE_bool_(catch_leaked_mocks);
GMOCK_DECLARE_string_(verbose);
GMOCK_DECLARE_int32_(default_mock_behavior);
+namespace testing {
+
// Initializes Google Mock. This must be called before running the
// tests. In particular, it parses the command line for the flags
// that Google Mock recognizes. Whenever a Google Mock flag is seen,
diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md b/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md
index f6c93f616..9c4874fd0 100644
--- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md
+++ b/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md
@@ -14,3 +14,5 @@ The following macros can be defined:
* `GMOCK_DEFINE_bool_(name, default_val, doc)`
* `GMOCK_DEFINE_int32_(name, default_val, doc)`
* `GMOCK_DEFINE_string_(name, default_val, doc)`
+* `GMOCK_FLAG_GET(flag_name)`
+* `GMOCK_FLAG_SET(flag_name, value)`
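A usage sketch of the two accessor macros added to this list; the flag name (verbose) and value ("info") are real gMock flag names/values, while the save/restore pattern itself is illustrative:

const std::string saved = GMOCK_FLAG_GET(verbose);
GMOCK_FLAG_SET(verbose, "info");
// ... run code whose informational gMock messages should be visible ...
GMOCK_FLAG_SET(verbose, saved);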
diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
index 63f899962..bbcad31c7 100644
--- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
+++ b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
@@ -1,4 +1,5 @@
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h
index 638429488..bb7dcbaa4 100644
--- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h
+++ b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h
@@ -26,10 +26,11 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Injection point for custom user configurations. See README for details
-//
-// GOOGLETEST_CM0002 DO NOT DELETE
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h
index 14378692a..f055f7506 100644
--- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h
+++ b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h
@@ -26,12 +26,13 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Injection point for custom user configurations. See README for details
//
// ** Custom implementation starts here **
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h b/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h
index 317544a7d..b1343fdc8 100644
--- a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h
+++ b/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h
@@ -27,22 +27,25 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file defines some utilities useful for implementing Google
// Mock. They are subject to change without notice, so please DO NOT
// USE THEM IN USER CODE.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
#include <stdio.h>
+
#include <ostream> // NOLINT
#include <string>
#include <type_traits>
+#include <vector>
+
#include "gmock/internal/gmock-port.h"
#include "gtest/gtest.h"
@@ -56,14 +59,15 @@ namespace internal {
// Silence MSVC C4100 (unreferenced formal parameter) and
// C4805('==': unsafe mix of type 'const int' and type 'const bool')
#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4100)
-# pragma warning(disable:4805)
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#pragma warning(disable : 4805)
#endif
// Joins a vector of strings as if they are fields of a tuple; returns
// the joined string.
-GTEST_API_ std::string JoinAsTuple(const Strings& fields);
+GTEST_API_ std::string JoinAsKeyValueTuple(
+ const std::vector<const char*>& names, const Strings& values);
// Converts an identifier name to a space-separated list of lower-case
// words. Each maximum substring of the form [A-Za-z][a-z]*|\d+ is
@@ -78,9 +82,18 @@ template <typename Pointer>
inline const typename Pointer::element_type* GetRawPointer(const Pointer& p) {
return p.get();
}
+// This overload version is for std::reference_wrapper, which does not work with
+// the overload above, as it does not have an `element_type`.
+template <typename Element>
+inline const Element* GetRawPointer(const std::reference_wrapper<Element>& r) {
+ return &r.get();
+}
+
// This overloaded version is for the raw pointer case.
template <typename Element>
-inline Element* GetRawPointer(Element* p) { return p; }
+inline Element* GetRawPointer(Element* p) {
+ return p;
+}
// MSVC treats wchar_t as a native type usually, but treats it as the
// same as unsigned short when the compiler option /Zc:wchar_t- is
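A minimal sketch of the new std::reference_wrapper overload of GetRawPointer above; GetRawPointer is an internal helper and the variable names are illustrative:

#include <functional>
int value = 42;
std::reference_wrapper<int> ref(value);
const int* p = testing::internal::GetRawPointer(ref);  // p == &value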
@@ -89,7 +102,7 @@ inline Element* GetRawPointer(Element* p) { return p; }
#if defined(_MSC_VER) && !defined(_NATIVE_WCHAR_T_DEFINED)
// wchar_t is a typedef.
#else
-# define GMOCK_WCHAR_T_IS_NATIVE_ 1
+#define GMOCK_WCHAR_T_IS_NATIVE_ 1
#endif
// In what follows, we use the term "kind" to indicate whether a type
@@ -97,18 +110,20 @@ inline Element* GetRawPointer(Element* p) { return p; }
// or none of them. This categorization is useful for determining
// when a matcher argument type can be safely converted to another
// type in the implementation of SafeMatcherCast.
-enum TypeKind {
- kBool, kInteger, kFloatingPoint, kOther
-};
+enum TypeKind { kBool, kInteger, kFloatingPoint, kOther };
// KindOf<T>::value is the kind of type T.
-template <typename T> struct KindOf {
+template <typename T>
+struct KindOf {
enum { value = kOther }; // The default kind.
};
// This macro declares that the kind of 'type' is 'kind'.
#define GMOCK_DECLARE_KIND_(type, kind) \
- template <> struct KindOf<type> { enum { value = kind }; }
+ template <> \
+ struct KindOf<type> { \
+ enum { value = kind }; \
+ }
GMOCK_DECLARE_KIND_(bool, kBool);
@@ -116,13 +131,13 @@ GMOCK_DECLARE_KIND_(bool, kBool);
GMOCK_DECLARE_KIND_(char, kInteger);
GMOCK_DECLARE_KIND_(signed char, kInteger);
GMOCK_DECLARE_KIND_(unsigned char, kInteger);
-GMOCK_DECLARE_KIND_(short, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(short, kInteger); // NOLINT
GMOCK_DECLARE_KIND_(unsigned short, kInteger); // NOLINT
GMOCK_DECLARE_KIND_(int, kInteger);
GMOCK_DECLARE_KIND_(unsigned int, kInteger);
-GMOCK_DECLARE_KIND_(long, kInteger); // NOLINT
-GMOCK_DECLARE_KIND_(unsigned long, kInteger); // NOLINT
-GMOCK_DECLARE_KIND_(long long, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(long, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(unsigned long, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(long long, kInteger); // NOLINT
GMOCK_DECLARE_KIND_(unsigned long long, kInteger); // NOLINT
#if GMOCK_WCHAR_T_IS_NATIVE_
@@ -137,7 +152,7 @@ GMOCK_DECLARE_KIND_(long double, kFloatingPoint);
#undef GMOCK_DECLARE_KIND_
// Evaluates to the kind of 'type'.
-#define GMOCK_KIND_OF_(type) \
+#define GMOCK_KIND_OF_(type) \
static_cast< ::testing::internal::TypeKind>( \
::testing::internal::KindOf<type>::value)
@@ -193,9 +208,7 @@ using LosslessArithmeticConvertible =
class FailureReporterInterface {
public:
// The type of a failure (either non-fatal or fatal).
- enum FailureType {
- kNonfatal, kFatal
- };
+ enum FailureType { kNonfatal, kFatal };
virtual ~FailureReporterInterface() {}
@@ -215,8 +228,8 @@ GTEST_API_ FailureReporterInterface* GetFailureReporter();
inline void Assert(bool condition, const char* file, int line,
const std::string& msg) {
if (!condition) {
- GetFailureReporter()->ReportFailure(FailureReporterInterface::kFatal,
- file, line, msg);
+ GetFailureReporter()->ReportFailure(FailureReporterInterface::kFatal, file,
+ line, msg);
}
}
inline void Assert(bool condition, const char* file, int line) {
@@ -237,10 +250,7 @@ inline void Expect(bool condition, const char* file, int line) {
}
// Severity level of a log.
-enum LogSeverity {
- kInfo = 0,
- kWarning = 1
-};
+enum LogSeverity { kInfo = 0, kWarning = 1 };
// Valid values for the --gmock_verbose flag.
@@ -281,10 +291,10 @@ class WithoutMatchers {
GTEST_API_ WithoutMatchers GetWithoutMatchers();
// Disable MSVC warnings for infinite recursion, since in this case the
-// the recursion is unreachable.
+// recursion is unreachable.
#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4717)
+#pragma warning(push)
+#pragma warning(disable : 4717)
#endif
// Invalid<T>() is usable as an expression of type T, but will terminate
@@ -295,14 +305,17 @@ GTEST_API_ WithoutMatchers GetWithoutMatchers();
template <typename T>
inline T Invalid() {
Assert(false, "", -1, "Internal error: attempt to return invalid value");
- // This statement is unreachable, and would never terminate even if it
- // could be reached. It is provided only to placate compiler warnings
- // about missing return statements.
+#if defined(__GNUC__) || defined(__clang__)
+ __builtin_unreachable();
+#elif defined(_MSC_VER)
+ __assume(0);
+#else
return Invalid<T>();
+#endif
}
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
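A standalone restatement of the compiler-specific unreachable ladder that Invalid<T>() now uses above; the helper name and the fallback branch are illustrative:

[[noreturn]] inline void UnreachableSketch() {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_unreachable();
#elif defined(_MSC_VER)
  __assume(0);
#else
  for (;;) {  // fallback: loop forever so the function never returns
  }
#endif
}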
// Given a raw type (i.e. having no top-level reference or const
@@ -381,7 +394,8 @@ class StlContainerView< ::std::tuple<ElementPointer, Size> > {
// The following specialization prevents the user from instantiating
// StlContainer with a reference type.
-template <typename T> class StlContainerView<T&>;
+template <typename T>
+class StlContainerView<T&>;
// A type transform to remove constness from the first part of a pair.
// Pairs like that are used as the value_type of associative containers,
@@ -402,17 +416,18 @@ struct RemoveConstFromKey<std::pair<const K, V> > {
GTEST_API_ void IllegalDoDefault(const char* file, int line);
template <typename F, typename Tuple, size_t... Idx>
-auto ApplyImpl(F&& f, Tuple&& args, IndexSequence<Idx...>) -> decltype(
- std::forward<F>(f)(std::get<Idx>(std::forward<Tuple>(args))...)) {
+auto ApplyImpl(F&& f, Tuple&& args, IndexSequence<Idx...>)
+ -> decltype(std::forward<F>(f)(
+ std::get<Idx>(std::forward<Tuple>(args))...)) {
return std::forward<F>(f)(std::get<Idx>(std::forward<Tuple>(args))...);
}
// Apply the function to a tuple of arguments.
template <typename F, typename Tuple>
-auto Apply(F&& f, Tuple&& args) -> decltype(
- ApplyImpl(std::forward<F>(f), std::forward<Tuple>(args),
- MakeIndexSequence<std::tuple_size<
- typename std::remove_reference<Tuple>::type>::value>())) {
+auto Apply(F&& f, Tuple&& args) -> decltype(ApplyImpl(
+ std::forward<F>(f), std::forward<Tuple>(args),
+ MakeIndexSequence<std::tuple_size<
+ typename std::remove_reference<Tuple>::type>::value>())) {
return ApplyImpl(std::forward<F>(f), std::forward<Tuple>(args),
MakeIndexSequence<std::tuple_size<
typename std::remove_reference<Tuple>::type>::value>());
@@ -449,8 +464,10 @@ struct Function<R(Args...)> {
template <typename R, typename... Args>
constexpr size_t Function<R(Args...)>::ArgumentCount;
+bool Base64Unescape(const std::string& encoded, std::string* decoded);
+
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
} // namespace internal
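
For context on the Invalid<T>() hunk above: the dead recursive `return Invalid<T>();` is kept only as a fallback, while GCC/Clang and MSVC builds now use compiler unreachability hints to silence missing-return warnings. A minimal standalone sketch of the same pattern (the EXAMPLE_UNREACHABLE macro is illustrative, not part of gmock):

```
#include <cstdlib>

// Illustrative wrapper; the gmock hunk above inlines these intrinsics directly.
#if defined(__GNUC__) || defined(__clang__)
#define EXAMPLE_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER)
#define EXAMPLE_UNREACHABLE() __assume(0)
#else
#define EXAMPLE_UNREACHABLE() std::abort()
#endif

enum class Color { kRed, kGreen };

// Every enumerator returns, so control never reaches the end of the switch;
// the hint placates "missing return" warnings without an unreachable
// recursive call.
const char* Name(Color c) {
  switch (c) {
    case Color::kRed:
      return "red";
    case Color::kGreen:
      return "green";
  }
  EXAMPLE_UNREACHABLE();
}

int main() { return Name(Color::kRed)[0] == 'r' ? 0 : 1; }
```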
diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h b/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h
index 367a44d36..bc18a25f3 100644
--- a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h
+++ b/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// Low-level types and utilities for porting Google Mock to various
// platforms. All macros ending with _ and symbols defined in an
// internal namespace are subject to change without notice. Code
@@ -35,7 +34,8 @@
// end with _ are part of Google Mock's public API and can be used by
// code outside Google Mock.
-// GOOGLETEST_CM0002 DO NOT DELETE
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
@@ -53,35 +53,87 @@
// here, as Google Mock depends on Google Test. Only add a utility
// here if it's truly specific to Google Mock.
-#include "gtest/internal/gtest-port.h"
#include "gmock/internal/custom/gmock-port.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#endif
// For MS Visual C++, check the compiler version. At least VS 2015 is
// required to compile Google Mock.
#if defined(_MSC_VER) && _MSC_VER < 1900
-# error "At least Visual C++ 2015 (14.0) is required to compile Google Mock."
+#error "At least Visual C++ 2015 (14.0) is required to compile Google Mock."
#endif
// Macro for referencing flags. This is public as we want the user to
// use this syntax to reference Google Mock flags.
+#define GMOCK_FLAG_NAME_(name) gmock_##name
#define GMOCK_FLAG(name) FLAGS_gmock_##name
-#if !defined(GMOCK_DECLARE_bool_)
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
+
+// Macros for defining flags.
+#define GMOCK_DEFINE_bool_(name, default_val, doc) \
+ ABSL_FLAG(bool, GMOCK_FLAG_NAME_(name), default_val, doc)
+#define GMOCK_DEFINE_int32_(name, default_val, doc) \
+ ABSL_FLAG(int32_t, GMOCK_FLAG_NAME_(name), default_val, doc)
+#define GMOCK_DEFINE_string_(name, default_val, doc) \
+ ABSL_FLAG(std::string, GMOCK_FLAG_NAME_(name), default_val, doc)
// Macros for declaring flags.
-# define GMOCK_DECLARE_bool_(name) extern GTEST_API_ bool GMOCK_FLAG(name)
-# define GMOCK_DECLARE_int32_(name) extern GTEST_API_ int32_t GMOCK_FLAG(name)
-# define GMOCK_DECLARE_string_(name) \
- extern GTEST_API_ ::std::string GMOCK_FLAG(name)
+#define GMOCK_DECLARE_bool_(name) \
+ ABSL_DECLARE_FLAG(bool, GMOCK_FLAG_NAME_(name))
+#define GMOCK_DECLARE_int32_(name) \
+ ABSL_DECLARE_FLAG(int32_t, GMOCK_FLAG_NAME_(name))
+#define GMOCK_DECLARE_string_(name) \
+ ABSL_DECLARE_FLAG(std::string, GMOCK_FLAG_NAME_(name))
+
+#define GMOCK_FLAG_GET(name) ::absl::GetFlag(GMOCK_FLAG(name))
+#define GMOCK_FLAG_SET(name, value) \
+ (void)(::absl::SetFlag(&GMOCK_FLAG(name), value))
+
+#else // GTEST_HAS_ABSL
// Macros for defining flags.
-# define GMOCK_DEFINE_bool_(name, default_val, doc) \
- GTEST_API_ bool GMOCK_FLAG(name) = (default_val)
-# define GMOCK_DEFINE_int32_(name, default_val, doc) \
- GTEST_API_ int32_t GMOCK_FLAG(name) = (default_val)
-# define GMOCK_DEFINE_string_(name, default_val, doc) \
- GTEST_API_ ::std::string GMOCK_FLAG(name) = (default_val)
-
-#endif // !defined(GMOCK_DECLARE_bool_)
+#define GMOCK_DEFINE_bool_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ bool GMOCK_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DEFINE_int32_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ int32_t GMOCK_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DEFINE_string_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ ::std::string GMOCK_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+// Macros for declaring flags.
+#define GMOCK_DECLARE_bool_(name) \
+ namespace testing { \
+ GTEST_API_ extern bool GMOCK_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DECLARE_int32_(name) \
+ namespace testing { \
+ GTEST_API_ extern int32_t GMOCK_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DECLARE_string_(name) \
+ namespace testing { \
+ GTEST_API_ extern ::std::string GMOCK_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+#define GMOCK_FLAG_GET(name) ::testing::GMOCK_FLAG(name)
+#define GMOCK_FLAG_SET(name, value) (void)(::testing::GMOCK_FLAG(name) = value)
+
+#endif // GTEST_HAS_ABSL
#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
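
The rewritten gmock-port.h above routes all flag access through GMOCK_FLAG_GET/GMOCK_FLAG_SET so the same call sites work whether the flags are backed by Abseil (GTEST_HAS_ABSL) or by the built-in globals. A hedged usage sketch (the file and test names are illustrative):

```
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(FlagAccessorDemo, ReadAndRestoreVerbosity) {
  // Go through the accessor macros instead of touching FLAGS_gmock_verbose
  // directly; the expansion differs under GTEST_HAS_ABSL but the call site
  // does not.
  const std::string saved = GMOCK_FLAG_GET(verbose);

  GMOCK_FLAG_SET(verbose, "info");
  EXPECT_EQ(GMOCK_FLAG_GET(verbose), "info");

  GMOCK_FLAG_SET(verbose, saved);  // restore for the remaining tests
}

int main(int argc, char** argv) {
  ::testing::InitGoogleMock(&argc, argv);
  return RUN_ALL_TESTS();
}
```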
diff --git a/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc b/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc
index 7463f4383..92cde3484 100644
--- a/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc
+++ b/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file implements cardinalities.
@@ -35,9 +34,11 @@
#include "gmock/gmock-cardinalities.h"
#include <limits.h>
+
#include <ostream> // NOLINT
#include <sstream>
#include <string>
+
#include "gmock/internal/gmock-internal-utils.h"
#include "gtest/gtest.h"
@@ -49,8 +50,7 @@ namespace {
class BetweenCardinalityImpl : public CardinalityInterface {
public:
BetweenCardinalityImpl(int min, int max)
- : min_(min >= 0 ? min : 0),
- max_(max >= min_ ? max : min_) {
+ : min_(min >= 0 ? min : 0), max_(max >= min_ ? max : min_) {
std::stringstream ss;
if (min < 0) {
ss << "The invocation lower bound must be >= 0, "
@@ -62,8 +62,7 @@ class BetweenCardinalityImpl : public CardinalityInterface {
internal::Expect(false, __FILE__, __LINE__, ss.str());
} else if (min > max) {
ss << "The invocation upper bound (" << max
- << ") must be >= the invocation lower bound (" << min
- << ").";
+ << ") must be >= the invocation lower bound (" << min << ").";
internal::Expect(false, __FILE__, __LINE__, ss.str());
}
}
@@ -87,7 +86,8 @@ class BetweenCardinalityImpl : public CardinalityInterface {
const int min_;
const int max_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(BetweenCardinalityImpl);
+ BetweenCardinalityImpl(const BetweenCardinalityImpl&) = delete;
+ BetweenCardinalityImpl& operator=(const BetweenCardinalityImpl&) = delete;
};
// Formats "n times" in a human-friendly way.
diff --git a/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc b/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc
index e5b547981..0a74841f3 100644
--- a/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc
+++ b/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file defines some utilities useful for implementing Google
@@ -37,8 +36,15 @@
#include "gmock/internal/gmock-internal-utils.h"
#include <ctype.h>
+
+#include <array>
+#include <cctype>
+#include <cstdint>
+#include <cstring>
#include <ostream> // NOLINT
#include <string>
+#include <vector>
+
#include "gmock/gmock.h"
#include "gmock/internal/gmock-port.h"
#include "gtest/gtest.h"
@@ -48,21 +54,22 @@ namespace internal {
// Joins a vector of strings as if they are fields of a tuple; returns
// the joined string.
-GTEST_API_ std::string JoinAsTuple(const Strings& fields) {
- switch (fields.size()) {
- case 0:
- return "";
- case 1:
- return fields[0];
- default:
- std::string result = "(" + fields[0];
- for (size_t i = 1; i < fields.size(); i++) {
- result += ", ";
- result += fields[i];
- }
- result += ")";
- return result;
+GTEST_API_ std::string JoinAsKeyValueTuple(
+ const std::vector<const char*>& names, const Strings& values) {
+ GTEST_CHECK_(names.size() == values.size());
+ if (values.empty()) {
+ return "";
}
+ const auto build_one = [&](const size_t i) {
+ return std::string(names[i]) + ": " + values[i];
+ };
+ std::string result = "(" + build_one(0);
+ for (size_t i = 1; i < values.size(); i++) {
+ result += ", ";
+ result += build_one(i);
+ }
+ result += ")";
+ return result;
}
// Converts an identifier name to a space-separated list of lower-case
@@ -76,12 +83,11 @@ GTEST_API_ std::string ConvertIdentifierNameToWords(const char* id_name) {
// We don't care about the current locale as the input is
// guaranteed to be a valid C++ identifier name.
const bool starts_new_word = IsUpper(*p) ||
- (!IsAlpha(prev_char) && IsLower(*p)) ||
- (!IsDigit(prev_char) && IsDigit(*p));
+ (!IsAlpha(prev_char) && IsLower(*p)) ||
+ (!IsDigit(prev_char) && IsDigit(*p));
if (IsAlNum(*p)) {
- if (starts_new_word && result != "")
- result += ' ';
+ if (starts_new_word && result != "") result += ' ';
result += ToLower(*p);
}
}
@@ -95,12 +101,9 @@ class GoogleTestFailureReporter : public FailureReporterInterface {
public:
void ReportFailure(FailureType type, const char* file, int line,
const std::string& message) override {
- AssertHelper(type == kFatal ?
- TestPartResult::kFatalFailure :
- TestPartResult::kNonFatalFailure,
- file,
- line,
- message.c_str()) = Message();
+ AssertHelper(type == kFatal ? TestPartResult::kFatalFailure
+ : TestPartResult::kNonFatalFailure,
+ file, line, message.c_str()) = Message();
if (type == kFatal) {
posix::Abort();
}
@@ -126,10 +129,10 @@ static GTEST_DEFINE_STATIC_MUTEX_(g_log_mutex);
// Returns true if and only if a log with the given severity is visible
// according to the --gmock_verbose flag.
GTEST_API_ bool LogIsVisible(LogSeverity severity) {
- if (GMOCK_FLAG(verbose) == kInfoVerbosity) {
+ if (GMOCK_FLAG_GET(verbose) == kInfoVerbosity) {
// Always show the log if --gmock_verbose=info.
return true;
- } else if (GMOCK_FLAG(verbose) == kErrorVerbosity) {
+ } else if (GMOCK_FLAG_GET(verbose) == kErrorVerbosity) {
// Always hide it if --gmock_verbose=error.
return false;
} else {
@@ -148,8 +151,7 @@ GTEST_API_ bool LogIsVisible(LogSeverity severity) {
// conservative.
GTEST_API_ void Log(LogSeverity severity, const std::string& message,
int stack_frames_to_skip) {
- if (!LogIsVisible(severity))
- return;
+ if (!LogIsVisible(severity)) return;
// Ensures that logs from different threads don't interleave.
MutexLock l(&g_log_mutex);
@@ -178,8 +180,8 @@ GTEST_API_ void Log(LogSeverity severity, const std::string& message,
std::cout << "\n";
}
std::cout << "Stack trace:\n"
- << ::testing::internal::GetCurrentOsStackTraceExceptTop(
- ::testing::UnitTest::GetInstance(), actual_to_skip);
+ << ::testing::internal::GetCurrentOsStackTraceExceptTop(
+ ::testing::UnitTest::GetInstance(), actual_to_skip);
}
std::cout << ::std::flush;
}
@@ -196,5 +198,53 @@ GTEST_API_ void IllegalDoDefault(const char* file, int line) {
"the variable in various places.");
}
+constexpr char UnBase64Impl(char c, const char* const base64, char carry) {
+ return *base64 == 0 ? static_cast<char>(65)
+ : *base64 == c ? carry
+ : UnBase64Impl(c, base64 + 1, carry + 1);
+}
+
+template <size_t... I>
+constexpr std::array<char, 256> UnBase64Impl(IndexSequence<I...>,
+ const char* const base64) {
+ return {{UnBase64Impl(static_cast<char>(I), base64, 0)...}};
+}
+
+constexpr std::array<char, 256> UnBase64(const char* const base64) {
+ return UnBase64Impl(MakeIndexSequence<256>{}, base64);
+}
+
+static constexpr char kBase64[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static constexpr std::array<char, 256> kUnBase64 = UnBase64(kBase64);
+
+bool Base64Unescape(const std::string& encoded, std::string* decoded) {
+ decoded->clear();
+ size_t encoded_len = encoded.size();
+ decoded->reserve(3 * (encoded_len / 4) + (encoded_len % 4));
+ int bit_pos = 0;
+ char dst = 0;
+ for (int src : encoded) {
+ if (std::isspace(src) || src == '=') {
+ continue;
+ }
+ char src_bin = kUnBase64[static_cast<size_t>(src)];
+ if (src_bin >= 64) {
+ decoded->clear();
+ return false;
+ }
+ if (bit_pos == 0) {
+ dst |= static_cast<char>(src_bin << 2);
+ bit_pos = 6;
+ } else {
+ dst |= static_cast<char>(src_bin >> (bit_pos - 2));
+ decoded->push_back(dst);
+ dst = static_cast<char>(src_bin << (10 - bit_pos));
+ bit_pos = (bit_pos + 6) % 8;
+ }
+ }
+ return true;
+}
+
} // namespace internal
} // namespace testing
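
The new Base64Unescape helper above decodes 6-bit groups into bytes, skipping whitespace and '=' padding and rejecting anything outside the base64 alphabet. A small sketch of calling it (it lives in the internal namespace, so treat this as illustration rather than supported API):

```
#include <cassert>
#include <iostream>
#include <string>

#include "gmock/internal/gmock-internal-utils.h"

int main() {
  std::string decoded;
  // "Zm9vYmFy" is the base64 encoding of "foobar".
  bool ok = testing::internal::Base64Unescape("Zm9vYmFy", &decoded);
  assert(ok && decoded == "foobar");
  std::cout << decoded << "\n";

  // A character outside the alphabet makes the call fail and clears the
  // output string.
  ok = testing::internal::Base64Unescape("not*base64", &decoded);
  assert(!ok && decoded.empty());
  return 0;
}
```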
diff --git a/third_party/googletest/src/googlemock/src/gmock-matchers.cc b/third_party/googletest/src/googlemock/src/gmock-matchers.cc
index dded437ad..a8d04a6da 100644
--- a/third_party/googletest/src/googlemock/src/gmock-matchers.cc
+++ b/third_party/googletest/src/googlemock/src/gmock-matchers.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file implements Matcher<const string&>, Matcher<string>, and
@@ -36,9 +35,11 @@
#include "gmock/gmock-matchers.h"
#include <string.h>
+
#include <iostream>
#include <sstream>
#include <string>
+#include <vector>
namespace testing {
namespace internal {
@@ -48,11 +49,13 @@ namespace internal {
// 'negation' is false; otherwise returns the description of the
// negation of the matcher. 'param_values' contains a list of strings
// that are the print-out of the matcher's parameters.
-GTEST_API_ std::string FormatMatcherDescription(bool negation,
- const char* matcher_name,
- const Strings& param_values) {
+GTEST_API_ std::string FormatMatcherDescription(
+ bool negation, const char* matcher_name,
+ const std::vector<const char*>& param_names, const Strings& param_values) {
std::string result = ConvertIdentifierNameToWords(matcher_name);
- if (param_values.size() >= 1) result += " " + JoinAsTuple(param_values);
+ if (param_values.size() >= 1) {
+ result += " " + JoinAsKeyValueTuple(param_names, param_values);
+ }
return negation ? "not (" + result + ")" : result;
}
diff --git a/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc b/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc
index c7266a370..658ad3fa2 100644
--- a/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc
+++ b/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Mock - a framework for writing C++ mock classes.
//
// This file implements the spec builder syntax (ON_CALL and
@@ -42,6 +41,7 @@
#include <memory>
#include <set>
#include <string>
+#include <unordered_map>
#include <vector>
#include "gmock/gmock.h"
@@ -49,15 +49,15 @@
#include "gtest/internal/gtest-port.h"
#if GTEST_OS_CYGWIN || GTEST_OS_LINUX || GTEST_OS_MAC
-# include <unistd.h> // NOLINT
+#include <unistd.h> // NOLINT
#endif
// Silence C4800 (C4800: 'int *const ': forcing value
// to bool 'true' or 'false') for MSVC 15
#ifdef _MSC_VER
#if _MSC_VER == 1900
-# pragma warning(push)
-# pragma warning(disable:4800)
+#pragma warning(push)
+#pragma warning(disable : 4800)
#endif
#endif
@@ -195,11 +195,12 @@ void ExpectationBase::DescribeCallCountTo(::std::ostream* os) const
// Describes the state of the expectation (e.g. is it satisfied?
// is it active?).
- *os << " - " << (IsOverSaturated() ? "over-saturated" :
- IsSaturated() ? "saturated" :
- IsSatisfied() ? "satisfied" : "unsatisfied")
- << " and "
- << (is_retired() ? "retired" : "active");
+ *os << " - "
+ << (IsOverSaturated() ? "over-saturated"
+ : IsSaturated() ? "saturated"
+ : IsSatisfied() ? "satisfied"
+ : "unsatisfied")
+ << " and " << (is_retired() ? "retired" : "active");
}
// Checks the action count (i.e. the number of WillOnce() and
@@ -242,13 +243,12 @@ void ExpectationBase::CheckActionCountIfNotDone() const
::std::stringstream ss;
DescribeLocationTo(&ss);
- ss << "Too " << (too_many ? "many" : "few")
- << " actions specified in " << source_text() << "...\n"
+ ss << "Too " << (too_many ? "many" : "few") << " actions specified in "
+ << source_text() << "...\n"
<< "Expected to be ";
cardinality().DescribeTo(&ss);
- ss << ", but has " << (too_many ? "" : "only ")
- << action_count << " WillOnce()"
- << (action_count == 1 ? "" : "s");
+ ss << ", but has " << (too_many ? "" : "only ") << action_count
+ << " WillOnce()" << (action_count == 1 ? "" : "s");
if (repeated_action_specified_) {
ss << " and a WillRepeatedly()";
}
@@ -264,10 +264,10 @@ void ExpectationBase::UntypedTimes(const Cardinality& a_cardinality) {
".Times() cannot appear "
"more than once in an EXPECT_CALL().");
} else {
- ExpectSpecProperty(last_clause_ < kTimes,
- ".Times() cannot appear after "
- ".InSequence(), .WillOnce(), .WillRepeatedly(), "
- "or .RetiresOnSaturation().");
+ ExpectSpecProperty(
+ last_clause_ < kTimes,
+ ".Times() may only appear *before* .InSequence(), .WillOnce(), "
+ ".WillRepeatedly(), or .RetiresOnSaturation(), not after.");
}
last_clause_ = kTimes;
@@ -283,7 +283,7 @@ GTEST_API_ ThreadLocal<Sequence*> g_gmock_implicit_sequence;
void ReportUninterestingCall(CallReaction reaction, const std::string& msg) {
// Include a stack trace only if --gmock_verbose=info is specified.
const int stack_frames_to_skip =
- GMOCK_FLAG(verbose) == kInfoVerbosity ? 3 : -1;
+ GMOCK_FLAG_GET(verbose) == kInfoVerbosity ? 3 : -1;
switch (reaction) {
case kAllow:
Log(kInfo, msg, stack_frames_to_skip);
@@ -370,143 +370,12 @@ const char* UntypedFunctionMockerBase::Name() const
return name;
}
-// Calculates the result of invoking this mock function with the given
-// arguments, prints it, and returns it. The caller is responsible
-// for deleting the result.
-UntypedActionResultHolderBase* UntypedFunctionMockerBase::UntypedInvokeWith(
- void* const untyped_args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
- // See the definition of untyped_expectations_ for why access to it
- // is unprotected here.
- if (untyped_expectations_.size() == 0) {
- // No expectation is set on this mock method - we have an
- // uninteresting call.
-
- // We must get Google Mock's reaction on uninteresting calls
- // made on this mock object BEFORE performing the action,
- // because the action may DELETE the mock object and make the
- // following expression meaningless.
- const CallReaction reaction =
- Mock::GetReactionOnUninterestingCalls(MockObject());
-
- // True if and only if we need to print this call's arguments and return
- // value. This definition must be kept in sync with
- // the behavior of ReportUninterestingCall().
- const bool need_to_report_uninteresting_call =
- // If the user allows this uninteresting call, we print it
- // only when they want informational messages.
- reaction == kAllow ? LogIsVisible(kInfo) :
- // If the user wants this to be a warning, we print
- // it only when they want to see warnings.
- reaction == kWarn
- ? LogIsVisible(kWarning)
- :
- // Otherwise, the user wants this to be an error, and we
- // should always print detailed information in the error.
- true;
-
- if (!need_to_report_uninteresting_call) {
- // Perform the action without printing the call information.
- return this->UntypedPerformDefaultAction(
- untyped_args, "Function call: " + std::string(Name()));
- }
-
- // Warns about the uninteresting call.
- ::std::stringstream ss;
- this->UntypedDescribeUninterestingCall(untyped_args, &ss);
-
- // Calculates the function result.
- UntypedActionResultHolderBase* const result =
- this->UntypedPerformDefaultAction(untyped_args, ss.str());
-
- // Prints the function result.
- if (result != nullptr) result->PrintAsActionResult(&ss);
-
- ReportUninterestingCall(reaction, ss.str());
- return result;
- }
-
- bool is_excessive = false;
- ::std::stringstream ss;
- ::std::stringstream why;
- ::std::stringstream loc;
- const void* untyped_action = nullptr;
-
- // The UntypedFindMatchingExpectation() function acquires and
- // releases g_gmock_mutex.
-
- const ExpectationBase* const untyped_expectation =
- this->UntypedFindMatchingExpectation(untyped_args, &untyped_action,
- &is_excessive, &ss, &why);
- const bool found = untyped_expectation != nullptr;
-
- // True if and only if we need to print the call's arguments
- // and return value.
- // This definition must be kept in sync with the uses of Expect()
- // and Log() in this function.
- const bool need_to_report_call =
- !found || is_excessive || LogIsVisible(kInfo);
- if (!need_to_report_call) {
- // Perform the action without printing the call information.
- return untyped_action == nullptr
- ? this->UntypedPerformDefaultAction(untyped_args, "")
- : this->UntypedPerformAction(untyped_action, untyped_args);
- }
-
- ss << " Function call: " << Name();
- this->UntypedPrintArgs(untyped_args, &ss);
-
- // In case the action deletes a piece of the expectation, we
- // generate the message beforehand.
- if (found && !is_excessive) {
- untyped_expectation->DescribeLocationTo(&loc);
- }
-
- UntypedActionResultHolderBase* result = nullptr;
-
- auto perform_action = [&] {
- return untyped_action == nullptr
- ? this->UntypedPerformDefaultAction(untyped_args, ss.str())
- : this->UntypedPerformAction(untyped_action, untyped_args);
- };
- auto handle_failures = [&] {
- ss << "\n" << why.str();
-
- if (!found) {
- // No expectation matches this call - reports a failure.
- Expect(false, nullptr, -1, ss.str());
- } else if (is_excessive) {
- // We had an upper-bound violation and the failure message is in ss.
- Expect(false, untyped_expectation->file(), untyped_expectation->line(),
- ss.str());
- } else {
- // We had an expected call and the matching expectation is
- // described in ss.
- Log(kInfo, loc.str() + ss.str(), 2);
- }
- };
-#if GTEST_HAS_EXCEPTIONS
- try {
- result = perform_action();
- } catch (...) {
- handle_failures();
- throw;
- }
-#else
- result = perform_action();
-#endif
-
- if (result != nullptr) result->PrintAsActionResult(&ss);
- handle_failures();
- return result;
-}
-
// Returns an Expectation object that references and co-owns exp,
// which must be an expectation on this mock function.
Expectation UntypedFunctionMockerBase::GetHandleOf(ExpectationBase* exp) {
// See the definition of untyped_expectations_ for why access to it
// is unprotected here.
- for (UntypedExpectations::const_iterator it =
- untyped_expectations_.begin();
+ for (UntypedExpectations::const_iterator it = untyped_expectations_.begin();
it != untyped_expectations_.end(); ++it) {
if (it->get() == exp) {
return Expectation(*it);
@@ -526,8 +395,7 @@ bool UntypedFunctionMockerBase::VerifyAndClearExpectationsLocked()
GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
g_gmock_mutex.AssertHeld();
bool expectations_met = true;
- for (UntypedExpectations::const_iterator it =
- untyped_expectations_.begin();
+ for (UntypedExpectations::const_iterator it = untyped_expectations_.begin();
it != untyped_expectations_.end(); ++it) {
ExpectationBase* const untyped_expectation = it->get();
if (untyped_expectation->IsOverSaturated()) {
@@ -538,15 +406,15 @@ bool UntypedFunctionMockerBase::VerifyAndClearExpectationsLocked()
} else if (!untyped_expectation->IsSatisfied()) {
expectations_met = false;
::std::stringstream ss;
- ss << "Actual function call count doesn't match "
- << untyped_expectation->source_text() << "...\n";
+ ss << "Actual function call count doesn't match "
+ << untyped_expectation->source_text() << "...\n";
// No need to show the source file location of the expectation
// in the description, as the Expect() call that follows already
// takes care of it.
untyped_expectation->MaybeDescribeExtraMatcherTo(&ss);
untyped_expectation->DescribeCallCountTo(&ss);
- Expect(false, untyped_expectation->file(),
- untyped_expectation->line(), ss.str());
+ Expect(false, untyped_expectation->file(), untyped_expectation->line(),
+ ss.str());
}
}
@@ -613,8 +481,7 @@ class MockObjectRegistry {
// object alive. Therefore we report any living object as test
// failure, unless the user explicitly asked us to ignore it.
~MockObjectRegistry() {
- if (!GMOCK_FLAG(catch_leaked_mocks))
- return;
+ if (!GMOCK_FLAG_GET(catch_leaked_mocks)) return;
int leaked_count = 0;
for (StateMap::const_iterator it = states_.begin(); it != states_.end();
@@ -634,7 +501,7 @@ class MockObjectRegistry {
<< state.first_used_test << ")";
}
std::cout << " should be deleted but never is. Its address is @"
- << it->first << ".";
+ << it->first << ".";
leaked_count++;
}
if (leaked_count > 0) {
@@ -668,57 +535,63 @@ MockObjectRegistry g_mock_object_registry;
// Maps a mock object to the reaction Google Mock should have when an
// uninteresting method is called. Protected by g_gmock_mutex.
-std::map<const void*, internal::CallReaction> g_uninteresting_call_reaction;
+std::unordered_map<uintptr_t, internal::CallReaction>&
+UninterestingCallReactionMap() {
+ static auto* map = new std::unordered_map<uintptr_t, internal::CallReaction>;
+ return *map;
+}
// Sets the reaction Google Mock should have when an uninteresting
// method of the given mock object is called.
-void SetReactionOnUninterestingCalls(const void* mock_obj,
+void SetReactionOnUninterestingCalls(uintptr_t mock_obj,
internal::CallReaction reaction)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
internal::MutexLock l(&internal::g_gmock_mutex);
- g_uninteresting_call_reaction[mock_obj] = reaction;
+ UninterestingCallReactionMap()[mock_obj] = reaction;
}
} // namespace
// Tells Google Mock to allow uninteresting calls on the given mock
// object.
-void Mock::AllowUninterestingCalls(const void* mock_obj)
+void Mock::AllowUninterestingCalls(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
SetReactionOnUninterestingCalls(mock_obj, internal::kAllow);
}
// Tells Google Mock to warn the user about uninteresting calls on the
// given mock object.
-void Mock::WarnUninterestingCalls(const void* mock_obj)
+void Mock::WarnUninterestingCalls(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
SetReactionOnUninterestingCalls(mock_obj, internal::kWarn);
}
// Tells Google Mock to fail uninteresting calls on the given mock
// object.
-void Mock::FailUninterestingCalls(const void* mock_obj)
+void Mock::FailUninterestingCalls(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
SetReactionOnUninterestingCalls(mock_obj, internal::kFail);
}
// Tells Google Mock the given mock object is being destroyed and its
// entry in the call-reaction table should be removed.
-void Mock::UnregisterCallReaction(const void* mock_obj)
+void Mock::UnregisterCallReaction(uintptr_t mock_obj)
GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
internal::MutexLock l(&internal::g_gmock_mutex);
- g_uninteresting_call_reaction.erase(mock_obj);
+ UninterestingCallReactionMap().erase(static_cast<uintptr_t>(mock_obj));
}
// Returns the reaction Google Mock will have on uninteresting calls
// made on the given mock object.
internal::CallReaction Mock::GetReactionOnUninterestingCalls(
- const void* mock_obj)
- GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+ const void* mock_obj) GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
internal::MutexLock l(&internal::g_gmock_mutex);
- return (g_uninteresting_call_reaction.count(mock_obj) == 0) ?
- internal::intToCallReaction(GMOCK_FLAG(default_mock_behavior)) :
- g_uninteresting_call_reaction[mock_obj];
+ return (UninterestingCallReactionMap().count(
+ reinterpret_cast<uintptr_t>(mock_obj)) == 0)
+ ? internal::intToCallReaction(
+ GMOCK_FLAG_GET(default_mock_behavior))
+ : UninterestingCallReactionMap()[reinterpret_cast<uintptr_t>(
+ mock_obj)];
}
// Tells Google Mock to ignore mock_obj when checking for leaked mock
@@ -873,8 +746,8 @@ Expectation::~Expectation() {}
void Sequence::AddExpectation(const Expectation& expectation) const {
if (*last_expectation_ != expectation) {
if (last_expectation_->expectation_base() != nullptr) {
- expectation.expectation_base()->immediate_prerequisites_
- += *last_expectation_;
+ expectation.expectation_base()->immediate_prerequisites_ +=
+ *last_expectation_;
}
*last_expectation_ = expectation;
}
@@ -903,6 +776,6 @@ InSequence::~InSequence() {
#ifdef _MSC_VER
#if _MSC_VER == 1900
-# pragma warning(pop)
+#pragma warning(pop)
#endif
#endif
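
The gmock-spec-builders.cc changes above replace the global std::map keyed by const void* with a lazily constructed unordered_map keyed by uintptr_t for the per-mock uninteresting-call reaction (kAllow/kWarn/kFail). From user code that table is driven by NiceMock, NaggyMock, and StrictMock; a small illustrative example (the Turtle interface is hypothetical):

```
#include "gmock/gmock.h"
#include "gtest/gtest.h"

class Turtle {
 public:
  virtual ~Turtle() = default;
  virtual void PenDown() = 0;
};

class MockTurtle : public Turtle {
 public:
  MOCK_METHOD(void, PenDown, (), (override));
};

// NiceMock registers kAllow, NaggyMock kWarn, and StrictMock kFail in the
// reaction table refactored above.
TEST(ReactionDemo, NiceMockSilencesUninterestingCalls) {
  ::testing::NiceMock<MockTurtle> nice;
  nice.PenDown();  // no "uninteresting call" warning is printed
}

TEST(ReactionDemo, NaggyMockWarnsButPasses) {
  ::testing::NaggyMock<MockTurtle> naggy;
  naggy.PenDown();  // logs an "uninteresting call" warning; the test passes
}

int main(int argc, char** argv) {
  ::testing::InitGoogleMock(&argc, argv);
  return RUN_ALL_TESTS();
}
```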
diff --git a/third_party/googletest/src/googlemock/src/gmock.cc b/third_party/googletest/src/googlemock/src/gmock.cc
index 7bcdb0ba2..5025656a0 100644
--- a/third_party/googletest/src/googlemock/src/gmock.cc
+++ b/third_party/googletest/src/googlemock/src/gmock.cc
@@ -27,17 +27,15 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include "gmock/gmock.h"
-#include "gmock/internal/gmock-port.h"
-namespace testing {
+#include "gmock/internal/gmock-port.h"
GMOCK_DEFINE_bool_(catch_leaked_mocks, true,
"true if and only if Google Mock should report leaked "
"mock objects as failures.");
-GMOCK_DEFINE_string_(verbose, internal::kWarningVerbosity,
+GMOCK_DEFINE_string_(verbose, testing::internal::kWarningVerbosity,
"Controls how verbose Google Mock's output is."
" Valid values:\n"
" info - prints all messages.\n"
@@ -51,6 +49,7 @@ GMOCK_DEFINE_int32_(default_mock_behavior, 1,
" 1 - by default, mocks act as NaggyMocks.\n"
" 2 - by default, mocks act as StrictMocks.");
+namespace testing {
namespace internal {
// Parses a string as a command line flag. The string should have the
@@ -59,18 +58,18 @@ namespace internal {
//
// Returns the value of the flag, or NULL if the parsing failed.
static const char* ParseGoogleMockFlagValue(const char* str,
- const char* flag,
+ const char* flag_name,
bool def_optional) {
// str and flag must not be NULL.
- if (str == nullptr || flag == nullptr) return nullptr;
+ if (str == nullptr || flag_name == nullptr) return nullptr;
// The flag must start with "--gmock_".
- const std::string flag_str = std::string("--gmock_") + flag;
- const size_t flag_len = flag_str.length();
- if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+ const std::string flag_name_str = std::string("--gmock_") + flag_name;
+ const size_t flag_name_len = flag_name_str.length();
+ if (strncmp(str, flag_name_str.c_str(), flag_name_len) != 0) return nullptr;
// Skips the flag name.
- const char* flag_end = str + flag_len;
+ const char* flag_end = str + flag_name_len;
// When def_optional is true, it's OK to not have a "=value" part.
if (def_optional && (flag_end[0] == '\0')) {
@@ -91,10 +90,10 @@ static const char* ParseGoogleMockFlagValue(const char* str,
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-static bool ParseGoogleMockBoolFlag(const char* str, const char* flag,
- bool* value) {
+static bool ParseGoogleMockFlag(const char* str, const char* flag_name,
+ bool* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseGoogleMockFlagValue(str, flag, true);
+ const char* const value_str = ParseGoogleMockFlagValue(str, flag_name, true);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
@@ -110,10 +109,10 @@ static bool ParseGoogleMockBoolFlag(const char* str, const char* flag,
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
template <typename String>
-static bool ParseGoogleMockStringFlag(const char* str, const char* flag,
- String* value) {
+static bool ParseGoogleMockFlag(const char* str, const char* flag_name,
+ String* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseGoogleMockFlagValue(str, flag, false);
+ const char* const value_str = ParseGoogleMockFlagValue(str, flag_name, false);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
@@ -123,17 +122,17 @@ static bool ParseGoogleMockStringFlag(const char* str, const char* flag,
return true;
}
-static bool ParseGoogleMockIntFlag(const char* str, const char* flag,
- int32_t* value) {
+static bool ParseGoogleMockFlag(const char* str, const char* flag_name,
+ int32_t* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseGoogleMockFlagValue(str, flag, true);
+ const char* const value_str = ParseGoogleMockFlagValue(str, flag_name, true);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
// Sets *value to the value of the flag.
- return ParseInt32(Message() << "The value of flag --" << flag,
- value_str, value);
+ return ParseInt32(Message() << "The value of flag --" << flag_name, value_str,
+ value);
}
// The internal implementation of InitGoogleMock().
@@ -152,11 +151,22 @@ void InitGoogleMockImpl(int* argc, CharType** argv) {
const char* const arg = arg_string.c_str();
// Do we see a Google Mock flag?
- if (ParseGoogleMockBoolFlag(arg, "catch_leaked_mocks",
- &GMOCK_FLAG(catch_leaked_mocks)) ||
- ParseGoogleMockStringFlag(arg, "verbose", &GMOCK_FLAG(verbose)) ||
- ParseGoogleMockIntFlag(arg, "default_mock_behavior",
- &GMOCK_FLAG(default_mock_behavior))) {
+ bool found_gmock_flag = false;
+
+#define GMOCK_INTERNAL_PARSE_FLAG(flag_name) \
+ if (!found_gmock_flag) { \
+ auto value = GMOCK_FLAG_GET(flag_name); \
+ if (ParseGoogleMockFlag(arg, #flag_name, &value)) { \
+ GMOCK_FLAG_SET(flag_name, value); \
+ found_gmock_flag = true; \
+ } \
+ }
+
+ GMOCK_INTERNAL_PARSE_FLAG(catch_leaked_mocks)
+ GMOCK_INTERNAL_PARSE_FLAG(verbose)
+ GMOCK_INTERNAL_PARSE_FLAG(default_mock_behavior)
+
+ if (found_gmock_flag) {
// Yes. Shift the remainder of the argv list left by one. Note
// that argv has (*argc + 1) elements, the last one always being
// NULL. The following loop moves the trailing NULL element as
diff --git a/third_party/googletest/src/googlemock/src/gmock_main.cc b/third_party/googletest/src/googlemock/src/gmock_main.cc
index 18c500f66..b411c5ecb 100644
--- a/third_party/googletest/src/googlemock/src/gmock_main.cc
+++ b/third_party/googletest/src/googlemock/src/gmock_main.cc
@@ -27,8 +27,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include <iostream>
+
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -56,7 +56,7 @@ void loop() { RUN_ALL_TESTS(); }
// https://web.archive.org/web/20170912203238/connect.microsoft.com/VisualStudio/feedback/details/394464/wmain-link-error-in-the-static-library
// // NOLINT
#if GTEST_OS_WINDOWS_MOBILE
-# include <tchar.h> // NOLINT
+#include <tchar.h> // NOLINT
GTEST_API_ int _tmain(int argc, TCHAR** argv) {
#else
diff --git a/third_party/googletest/src/googletest/CMakeLists.txt b/third_party/googletest/src/googletest/CMakeLists.txt
index abdd98b79..aa00a5f3d 100644
--- a/third_party/googletest/src/googletest/CMakeLists.txt
+++ b/third_party/googletest/src/googletest/CMakeLists.txt
@@ -46,14 +46,9 @@ endif()
# Project version:
-if (CMAKE_VERSION VERSION_LESS 3.0)
- project(gtest CXX C)
- set(PROJECT_VERSION ${GOOGLETEST_VERSION})
-else()
- cmake_policy(SET CMP0048 NEW)
- project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
-endif()
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.5)
+cmake_policy(SET CMP0048 NEW)
+project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
if (POLICY CMP0063) # Visibility
cmake_policy(SET CMP0063 NEW)
@@ -136,13 +131,17 @@ set_target_properties(gtest_main PROPERTIES VERSION ${GOOGLETEST_VERSION})
# to the targets for when we are part of a parent build (ie being pulled
# in via add_subdirectory() rather than being a standalone build).
if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+ string(REPLACE ";" "$<SEMICOLON>" dirs "${gtest_build_include_dirs}")
target_include_directories(gtest SYSTEM INTERFACE
- "$<BUILD_INTERFACE:${gtest_build_include_dirs}>"
+ "$<BUILD_INTERFACE:${dirs}>"
"$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
target_include_directories(gtest_main SYSTEM INTERFACE
- "$<BUILD_INTERFACE:${gtest_build_include_dirs}>"
+ "$<BUILD_INTERFACE:${dirs}>"
"$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
endif()
+if(CMAKE_SYSTEM_NAME MATCHES "QNX")
+ target_link_libraries(gtest PUBLIC regex)
+endif()
target_link_libraries(gtest_main PUBLIC gtest)
########################################################################
diff --git a/third_party/googletest/src/googletest/README.md b/third_party/googletest/src/googletest/README.md
index 1f8b349ae..d26b309ed 100644
--- a/third_party/googletest/src/googletest/README.md
+++ b/third_party/googletest/src/googletest/README.md
@@ -25,7 +25,7 @@ When building GoogleTest as a standalone project, the typical workflow starts
with
```
-git clone https://github.com/google/googletest.git -b release-1.10.0
+git clone https://github.com/google/googletest.git -b release-1.11.0
cd googletest # Main directory of the cloned repository.
mkdir build # Create a directory to hold the build output.
cd build
@@ -94,7 +94,7 @@ include(FetchContent)
FetchContent_Declare(
googletest
# Specify the commit you depend on and update it regularly.
- URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
+ URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -203,7 +203,9 @@ add
-DGTEST_DONT_DEFINE_FOO=1
to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
-to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
+to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`,
+`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`,
+`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For
example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
GTEST_TEST(SomeTest, DoesThis) { ... }
diff --git a/third_party/googletest/src/googletest/cmake/internal_utils.cmake b/third_party/googletest/src/googletest/cmake/internal_utils.cmake
index 8d8d60a86..5a34c07a1 100644
--- a/third_party/googletest/src/googletest/cmake/internal_utils.cmake
+++ b/third_party/googletest/src/googletest/cmake/internal_utils.cmake
@@ -84,13 +84,13 @@ macro(config_compiler_and_linker)
# Ensure MSVC treats source files as UTF-8 encoded.
set(cxx_base_flags "${cxx_base_flags} -utf-8")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
- set(cxx_base_flags "-Wall -Wshadow -Werror -Wconversion")
+ set(cxx_base_flags "-Wall -Wshadow -Wconversion")
set(cxx_exception_flags "-fexceptions")
set(cxx_no_exception_flags "-fno-exceptions")
set(cxx_strict_flags "-W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls")
set(cxx_no_rtti_flags "-fno-rtti")
elseif (CMAKE_COMPILER_IS_GNUCXX)
- set(cxx_base_flags "-Wall -Wshadow -Werror")
+ set(cxx_base_flags "-Wall -Wshadow")
if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)
set(cxx_base_flags "${cxx_base_flags} -Wno-error=dangling-else")
endif()
@@ -154,10 +154,6 @@ function(cxx_library_with_type name type cxx_flags)
set_target_properties(${name}
PROPERTIES
COMPILE_FLAGS "${cxx_flags}")
- # Generate debug library name with a postfix.
- set_target_properties(${name}
- PROPERTIES
- DEBUG_POSTFIX "d")
# Set the output directory for build artifacts
set_target_properties(${name}
PROPERTIES
@@ -304,6 +300,8 @@ function(py_test name)
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
--build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE} ${ARGN})
endif()
+ # Make the Python import path consistent between Bazel and CMake.
+ set_tests_properties(${name} PROPERTIES ENVIRONMENT PYTHONPATH=${CMAKE_SOURCE_DIR})
endif(PYTHONINTERP_FOUND)
endfunction()
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h b/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h
new file mode 100644
index 000000000..addbb59c6
--- /dev/null
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h
@@ -0,0 +1,237 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements the AssertionResult type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A class for indicating whether an assertion was successful. When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+// 1. Defining predicate functions to be used with Boolean test assertions
+// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+// 2. Defining predicate-format functions to be
+// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+// Value of: IsEven(Fib(5))
+// Actual: false (5 is odd)
+// Expected: true
+//
+// instead of a more opaque
+//
+// Value of: IsEven(Fib(5))
+// Actual: false
+// Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess() << n << " is even";
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+// Value of: IsEven(Fib(6))
+// Actual: true (8 is even)
+// Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+// // Verifies that Foo() returns an even number.
+// EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+// testing::AssertionResult IsEven(const char* expr, int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure()
+// << "Expected: " << expr << " is even\n Actual: it's " << n;
+// }
+//
+// If Foo() returns 5, you will see the following message:
+//
+// Expected: Foo() is even
+// Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+ // Copy constructor.
+ // Used in EXPECT_TRUE/FALSE(assertion_result).
+ AssertionResult(const AssertionResult& other);
+
+// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
+// This warning is not emitted in Visual Studio 2017.
+// This warning is off by default starting in Visual Studio 2019 but can be
+// enabled with command-line options.
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
+
+ // Used in the EXPECT_TRUE/FALSE(bool_expression).
+ //
+ // T must be contextually convertible to bool.
+ //
+ // The second parameter prevents this overload from being considered if
+ // the argument is implicitly convertible to AssertionResult. In that case
+ // we want AssertionResult's copy constructor to be used.
+ template <typename T>
+ explicit AssertionResult(
+ const T& success,
+ typename std::enable_if<
+ !std::is_convertible<T, AssertionResult>::value>::type*
+ /*enabler*/
+ = nullptr)
+ : success_(success) {}
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+ // Assignment operator.
+ AssertionResult& operator=(AssertionResult other) {
+ swap(other);
+ return *this;
+ }
+
+ // Returns true if and only if the assertion succeeded.
+ operator bool() const { return success_; } // NOLINT
+
+ // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+ AssertionResult operator!() const;
+
+ // Returns the text streamed into this AssertionResult. Test assertions
+ // use it when they fail (i.e., the predicate's outcome doesn't match the
+ // assertion's expectation). When nothing has been streamed into the
+ // object, returns an empty string.
+ const char* message() const {
+ return message_.get() != nullptr ? message_->c_str() : "";
+ }
+ // Deprecated; please use message() instead.
+ const char* failure_message() const { return message(); }
+
+ // Streams a custom failure message into this object.
+ template <typename T>
+ AssertionResult& operator<<(const T& value) {
+ AppendMessage(Message() << value);
+ return *this;
+ }
+
+ // Allows streaming basic output manipulators such as endl or flush into
+ // this object.
+ AssertionResult& operator<<(
+ ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+ AppendMessage(Message() << basic_manipulator);
+ return *this;
+ }
+
+ private:
+ // Appends the contents of message to message_.
+ void AppendMessage(const Message& a_message) {
+ if (message_.get() == nullptr) message_.reset(new ::std::string);
+ message_->append(a_message.GetString().c_str());
+ }
+
+ // Swap the contents of this AssertionResult with other.
+ void swap(AssertionResult& other);
+
+ // Stores result of the assertion predicate.
+ bool success_;
+ // Stores the message describing the condition in case the expectation
+ // construct is not satisfied with the predicate's outcome.
+ // Referenced via a pointer to avoid taking too much stack frame space
+ // with test assertions.
+ std::unique_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h b/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
index 9b4d4d133..84e5a5bbd 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
@@ -27,21 +27,21 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the public API for death tests. It is
// #included by gtest.h so a user doesn't need to include this
// directly.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
#include "gtest/internal/gtest-death-test-internal.h"
-namespace testing {
-
// This flag controls the style of death tests. Valid values are "threadsafe",
// meaning that the death test child process will re-execute the test binary
// from the start, running only a single death test, or "fast",
@@ -49,6 +49,8 @@ namespace testing {
// after forking.
GTEST_DECLARE_string_(death_test_style);
+namespace testing {
+
#if GTEST_HAS_DEATH_TEST
namespace internal {
@@ -103,7 +105,6 @@ GTEST_API_ bool InDeathTestChild();
//
// On the regular expressions used in death tests:
//
-// GOOGLETEST_CM0005 DO NOT DELETE
// On POSIX-compliant systems (*nix), we use the <regex.h> library,
// which uses the POSIX extended regex syntax.
//
@@ -169,24 +170,24 @@ GTEST_API_ bool InDeathTestChild();
// Asserts that a given `statement` causes the program to exit, with an
// integer exit status that satisfies `predicate`, and emitting error output
// that matches `matcher`.
-# define ASSERT_EXIT(statement, predicate, matcher) \
- GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
+#define ASSERT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
// Like `ASSERT_EXIT`, but continues on to successive tests in the
// test suite, if any:
-# define EXPECT_EXIT(statement, predicate, matcher) \
- GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
// Asserts that a given `statement` causes the program to exit, either by
// explicitly exiting with a nonzero exit code or being killed by a
// signal, and emitting error output that matches `matcher`.
-# define ASSERT_DEATH(statement, matcher) \
- ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+#define ASSERT_DEATH(statement, matcher) \
+ ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
// Like `ASSERT_DEATH`, but continues on to successive tests in the
// test suite, if any:
-# define EXPECT_DEATH(statement, matcher) \
- EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+#define EXPECT_DEATH(statement, matcher) \
+ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
@@ -197,22 +198,23 @@ class GTEST_API_ ExitedWithCode {
ExitedWithCode(const ExitedWithCode&) = default;
void operator=(const ExitedWithCode& other) = delete;
bool operator()(int exit_status) const;
+
private:
const int exit_code_;
};
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Tests that an exit code describes an exit due to termination by a
// given signal.
-// GOOGLETEST_CM0006 DO NOT DELETE
class GTEST_API_ KilledBySignal {
public:
explicit KilledBySignal(int signum);
bool operator()(int exit_status) const;
+
private:
const int signum_;
};
-# endif // !GTEST_OS_WINDOWS
+#endif // !GTEST_OS_WINDOWS
// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
// The death testing framework causes this to have interesting semantics,
@@ -257,23 +259,21 @@ class GTEST_API_ KilledBySignal {
// EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
// }, "death");
//
-# ifdef NDEBUG
+#ifdef NDEBUG
-# define EXPECT_DEBUG_DEATH(statement, regex) \
+#define EXPECT_DEBUG_DEATH(statement, regex) \
GTEST_EXECUTE_STATEMENT_(statement, regex)
-# define ASSERT_DEBUG_DEATH(statement, regex) \
+#define ASSERT_DEBUG_DEATH(statement, regex) \
GTEST_EXECUTE_STATEMENT_(statement, regex)
-# else
+#else
-# define EXPECT_DEBUG_DEATH(statement, regex) \
- EXPECT_DEATH(statement, regex)
+#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
-# define ASSERT_DEBUG_DEATH(statement, regex) \
- ASSERT_DEATH(statement, regex)
+#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
-# endif // NDEBUG for EXPECT_DEBUG_DEATH
+#endif // NDEBUG for EXPECT_DEBUG_DEATH
#endif // GTEST_HAS_DEATH_TEST
// This macro is used for implementing macros such as
@@ -311,18 +311,17 @@ class GTEST_API_ KilledBySignal {
// statement unconditionally returns or throws. The Message constructor at
// the end allows the syntax of streaming additional messages into the
// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
- GTEST_LOG_(WARNING) \
- << "Death tests are not supported on this platform.\n" \
- << "Statement '" #statement "' cannot be verified."; \
- } else if (::testing::internal::AlwaysFalse()) { \
- ::testing::internal::RE::PartialMatch(".*", (regex)); \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- terminator; \
- } else \
- ::testing::Message()
+#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
+ << "Statement '" #statement "' cannot be verified."; \
+ } else if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::RE::PartialMatch(".*", (regex)); \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ terminator; \
+ } else \
+ ::testing::Message()
// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
@@ -330,15 +329,15 @@ class GTEST_API_ KilledBySignal {
// useful when you are combining death test assertions with normal test
// assertions in one test.
#if GTEST_HAS_DEATH_TEST
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
- EXPECT_DEATH(statement, regex)
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
- ASSERT_DEATH(statement, regex)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ ASSERT_DEATH(statement, regex)
#else
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
- GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
- GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
#endif
} // namespace testing
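
The death-test macros above are mostly reindented (`# define` becomes `#define`), with EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED still degrading to the warning-only GTEST_UNSUPPORTED_DEATH_TEST stub on platforms without death-test support. A brief usage sketch (Divide is a made-up function for illustration):

```
#include <cstdio>
#include <cstdlib>

#include "gtest/gtest.h"

int Divide(int a, int b) {
  if (b == 0) {
    std::fprintf(stderr, "division by zero\n");
    std::abort();
  }
  return a / b;
}

TEST(DeathDemo, DividingByZeroDies) {
  // Expands to a real death test where supported, otherwise to the
  // warning-only stub defined above.
  EXPECT_DEATH_IF_SUPPORTED(Divide(1, 0), "division by zero");
  EXPECT_EQ(Divide(6, 3), 2);
}

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```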
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h b/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
index 9fa34a05b..bffa00c53 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
@@ -32,6 +32,10 @@
// This file implements just enough of the matcher interface to allow
// EXPECT_DEATH and friends to accept a matcher argument.
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
@@ -98,11 +102,11 @@ class MatchResultListener {
private:
::std::ostream* const stream_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
+ MatchResultListener(const MatchResultListener&) = delete;
+ MatchResultListener& operator=(const MatchResultListener&) = delete;
};
-inline MatchResultListener::~MatchResultListener() {
-}
+inline MatchResultListener::~MatchResultListener() {}
// An instance of a subclass of this knows how to describe itself as a
// matcher.
@@ -176,27 +180,39 @@ namespace internal {
struct AnyEq {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a == b; }
+ bool operator()(const A& a, const B& b) const {
+ return a == b;
+ }
};
struct AnyNe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a != b; }
+ bool operator()(const A& a, const B& b) const {
+ return a != b;
+ }
};
struct AnyLt {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a < b; }
+ bool operator()(const A& a, const B& b) const {
+ return a < b;
+ }
};
struct AnyGt {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a > b; }
+ bool operator()(const A& a, const B& b) const {
+ return a > b;
+ }
};
struct AnyLe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a <= b; }
+ bool operator()(const A& a, const B& b) const {
+ return a <= b;
+ }
};
struct AnyGe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a >= b; }
+ bool operator()(const A& a, const B& b) const {
+ return a >= b;
+ }
};
// A match result listener that ignores the explanation.
@@ -205,7 +221,8 @@ class DummyMatchResultListener : public MatchResultListener {
DummyMatchResultListener() : MatchResultListener(nullptr) {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
+ DummyMatchResultListener(const DummyMatchResultListener&) = delete;
+ DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete;
};
// A match result listener that forwards the explanation to a given
@@ -217,7 +234,9 @@ class StreamMatchResultListener : public MatchResultListener {
: MatchResultListener(os) {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
+ StreamMatchResultListener(const StreamMatchResultListener&) = delete;
+ StreamMatchResultListener& operator=(const StreamMatchResultListener&) =
+ delete;
};
struct SharedPayloadBase {
@@ -284,17 +303,18 @@ class MatcherBase : private MatcherDescriberInterface {
}
protected:
- MatcherBase() : vtable_(nullptr) {}
+ MatcherBase() : vtable_(nullptr), buffer_() {}
// Constructs a matcher from its implementation.
template <typename U>
- explicit MatcherBase(const MatcherInterface<U>* impl) {
+ explicit MatcherBase(const MatcherInterface<U>* impl)
+ : vtable_(nullptr), buffer_() {
Init(impl);
}
template <typename M, typename = typename std::remove_reference<
M>::type::is_gtest_matcher>
- MatcherBase(M&& m) { // NOLINT
+ MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT
Init(std::forward<M>(m));
}
@@ -420,8 +440,8 @@ class MatcherBase : private MatcherDescriberInterface {
static const M& Get(const MatcherBase& m) {
// When inlined along with Init, need to be explicit to avoid violating
// strict aliasing rules.
- const M *ptr = static_cast<const M*>(
- static_cast<const void*>(&m.buffer_));
+ const M* ptr =
+ static_cast<const M*>(static_cast<const void*>(&m.buffer_));
return *ptr;
}
static void Init(MatcherBase& m, M impl) {
@@ -741,7 +761,7 @@ template <typename Rhs>
class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
public:
explicit EqMatcher(const Rhs& rhs)
- : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { }
+ : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
static const char* Desc() { return "is equal to"; }
static const char* NegatedDesc() { return "isn't equal to"; }
};
@@ -749,7 +769,7 @@ template <typename Rhs>
class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
public:
explicit NeMatcher(const Rhs& rhs)
- : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { }
+ : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
static const char* Desc() { return "isn't equal to"; }
static const char* NegatedDesc() { return "is equal to"; }
};
@@ -757,7 +777,7 @@ template <typename Rhs>
class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
public:
explicit LtMatcher(const Rhs& rhs)
- : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { }
+ : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
static const char* Desc() { return "is <"; }
static const char* NegatedDesc() { return "isn't <"; }
};
@@ -765,7 +785,7 @@ template <typename Rhs>
class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
public:
explicit GtMatcher(const Rhs& rhs)
- : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { }
+ : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
static const char* Desc() { return "is >"; }
static const char* NegatedDesc() { return "isn't >"; }
};
@@ -773,7 +793,7 @@ template <typename Rhs>
class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
public:
explicit LeMatcher(const Rhs& rhs)
- : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { }
+ : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
static const char* Desc() { return "is <="; }
static const char* NegatedDesc() { return "isn't <="; }
};
@@ -781,7 +801,7 @@ template <typename Rhs>
class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
public:
explicit GeMatcher(const Rhs& rhs)
- : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { }
+ : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
static const char* Desc() { return "is >="; }
static const char* NegatedDesc() { return "isn't >="; }
};
@@ -872,12 +892,16 @@ PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
// Note: if the parameter of Eq() were declared as const T&, Eq("foo")
// wouldn't compile.
template <typename T>
-inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); }
+inline internal::EqMatcher<T> Eq(T x) {
+ return internal::EqMatcher<T>(x);
+}
// Constructs a Matcher<T> from a 'value' of type T. The constructed
// matcher matches any value that's equal to 'value'.
template <typename T>
-Matcher<T>::Matcher(T value) { *this = Eq(value); }
+Matcher<T>::Matcher(T value) {
+ *this = Eq(value);
+}
// Creates a monomorphic matcher that matches anything with type Lhs
// and equal to rhs. A user may need to use this instead of Eq(...)
@@ -892,7 +916,9 @@ Matcher<T>::Matcher(T value) { *this = Eq(value); }
// can always write Matcher<T>(Lt(5)) to be explicit about the type,
// for example.
template <typename Lhs, typename Rhs>
-inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); }
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) {
+ return Eq(rhs);
+}
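An illustrative sketch (hypothetical test names, not part of the patch) of Eq() versus TypedEq<T>() as documented above:

#include "gtest/gtest.h"

TEST(MatcherSketch, EqAndTypedEq) {
  // Eq() lets the matcher's type be deduced when a Matcher<T> is built
  // from it; TypedEq<T>() pins the type explicitly, which helps when the
  // surrounding context (e.g. an overloaded method) would be ambiguous.
  ::testing::Matcher<int> eq_five = ::testing::Eq(5);
  ::testing::Matcher<int> typed_eq_five = ::testing::TypedEq<int>(5);
  EXPECT_TRUE(eq_five.Matches(5));
  EXPECT_FALSE(typed_eq_five.Matches(6));
}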
// Creates a polymorphic matcher that matches anything >= x.
template <typename Rhs>
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-message.h b/third_party/googletest/src/googletest/include/gtest/gtest-message.h
index becfd49fc..6c8bf9000 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-message.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-message.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the Message class.
@@ -42,7 +41,9 @@
// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
// program!
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
@@ -110,8 +111,8 @@ class GTEST_API_ Message {
// Streams a non-pointer value to this object.
template <typename T>
- inline Message& operator <<(const T& val) {
- // Some libraries overload << for STL containers. These
+ inline Message& operator<<(const T& val) {
+ // Some libraries overload << for STL containers. These
// overloads are defined in the global namespace instead of ::std.
//
// C++'s symbol lookup rule (i.e. Koenig lookup) says that these
@@ -125,7 +126,7 @@ class GTEST_API_ Message {
// from the global namespace. With this using declaration,
// overloads of << defined in the global namespace and those
// visible via Koenig lookup are both exposed in this function.
- using ::operator <<;
+ using ::operator<<;
*ss_ << val;
return *this;
}
@@ -144,7 +145,7 @@ class GTEST_API_ Message {
// ensure consistent result across compilers, we always treat NULL
// as "(null)".
template <typename T>
- inline Message& operator <<(T* const& pointer) { // NOLINT
+ inline Message& operator<<(T* const& pointer) { // NOLINT
if (pointer == nullptr) {
*ss_ << "(null)";
} else {
@@ -159,25 +160,23 @@ class GTEST_API_ Message {
// templatized version above. Without this definition, streaming
// endl or other basic IO manipulators to Message will confuse the
// compiler.
- Message& operator <<(BasicNarrowIoManip val) {
+ Message& operator<<(BasicNarrowIoManip val) {
*ss_ << val;
return *this;
}
// Instead of 1/0, we want to see true/false for bool values.
- Message& operator <<(bool b) {
- return *this << (b ? "true" : "false");
- }
+ Message& operator<<(bool b) { return *this << (b ? "true" : "false"); }
// These two overloads allow streaming a wide C string to a Message
// using the UTF-8 encoding.
- Message& operator <<(const wchar_t* wide_c_str);
- Message& operator <<(wchar_t* wide_c_str);
+ Message& operator<<(const wchar_t* wide_c_str);
+ Message& operator<<(wchar_t* wide_c_str);
#if GTEST_HAS_STD_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
- Message& operator <<(const ::std::wstring& wstr);
+ Message& operator<<(const ::std::wstring& wstr);
#endif // GTEST_HAS_STD_WSTRING
// Gets the text streamed to this object so far as an std::string.
@@ -196,7 +195,7 @@ class GTEST_API_ Message {
};
// Streams a Message to an ostream.
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+inline std::ostream& operator<<(std::ostream& os, const Message& sb) {
return os << sb.GetString();
}
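A brief usage sketch (hypothetical test, not from the patch) of the Message streaming operators reformatted in this hunk, including the true/false handling of bool:

#include "gtest/gtest.h"

TEST(MessageSketch, StreamsHeterogeneousValues) {
  ::testing::Message msg;
  msg << "value = " << 42 << ", flag = " << true;  // bool prints as true/false
  EXPECT_EQ("value = 42, flag = true", msg.GetString());
}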
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h b/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
index 804e70281..b55119ac6 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
@@ -26,11 +26,14 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Macros and functions for implementing parameterized tests
// in Google C++ Testing and Mocking Framework (Google Test)
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
@@ -353,9 +356,7 @@ internal::ValueArray<T...> Values(T... v) {
// }
// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
//
-inline internal::ParamGenerator<bool> Bool() {
- return Values(false, true);
-}
+inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
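A compact sketch of the Bool() generator mentioned in the comment above, using a hypothetical FlagDependentTest fixture (names are illustrative, not part of the patch):

#include "gtest/gtest.h"

class FlagDependentTest : public ::testing::TestWithParam<bool> {};

TEST_P(FlagDependentTest, RunsForBothFlagValues) {
  const bool flag = GetParam();
  EXPECT_EQ(flag, GetParam());  // placeholder assertion
}

// Instantiates the suite once with false and once with true.
INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, ::testing::Bool());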
// Combine() allows the user to combine two or more sequences to produce
// values of a Cartesian product of those sequences' elements.
@@ -428,8 +429,11 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
return 0; \
} \
static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
}; \
int GTEST_TEST_CLASS_NAME_(test_suite_name, \
test_name)::gtest_registering_dummy_ = \
@@ -453,43 +457,42 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
#define GTEST_GET_FIRST_(first, ...) first
#define GTEST_GET_SECOND_(first, second, ...) second
-#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \
- static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \
- gtest_##prefix##test_suite_name##_EvalGenerator_() { \
- return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \
- } \
- static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \
- const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \
- if (::testing::internal::AlwaysFalse()) { \
- ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \
- __VA_ARGS__, \
- ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
- DUMMY_PARAM_))); \
- auto t = std::make_tuple(__VA_ARGS__); \
- static_assert(std::tuple_size<decltype(t)>::value <= 2, \
- "Too Many Args!"); \
- } \
- return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \
- __VA_ARGS__, \
- ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
- DUMMY_PARAM_))))(info); \
- } \
- static int gtest_##prefix##test_suite_name##_dummy_ \
- GTEST_ATTRIBUTE_UNUSED_ = \
- ::testing::UnitTest::GetInstance() \
- ->parameterized_test_registry() \
- .GetTestSuitePatternHolder<test_suite_name>( \
- GTEST_STRINGIFY_(test_suite_name), \
- ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
- ->AddTestSuiteInstantiation( \
- GTEST_STRINGIFY_(prefix), \
- &gtest_##prefix##test_suite_name##_EvalGenerator_, \
- &gtest_##prefix##test_suite_name##_EvalGenerateName_, \
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \
+ static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \
+ gtest_##prefix##test_suite_name##_EvalGenerator_() { \
+ return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \
+ } \
+ static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \
+ const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \
+ if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))); \
+ auto t = std::make_tuple(__VA_ARGS__); \
+ static_assert(std::tuple_size<decltype(t)>::value <= 2, \
+ "Too Many Args!"); \
+ } \
+ return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))))(info); \
+ } \
+ static int gtest_##prefix##test_suite_name##_dummy_ \
+ GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::UnitTest::GetInstance() \
+ ->parameterized_test_registry() \
+ .GetTestSuitePatternHolder<test_suite_name>( \
+ GTEST_STRINGIFY_(test_suite_name), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
+ ->AddTestSuiteInstantiation( \
+ GTEST_STRINGIFY_(prefix), \
+ &gtest_##prefix##test_suite_name##_EvalGenerator_, \
+ &gtest_##prefix##test_suite_name##_EvalGenerateName_, \
__FILE__, __LINE__)
-
// Allow Marking a Parameterized test class as not needing to be instantiated.
-#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \
namespace gtest_do_not_use_outside_namespace_scope {} \
static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
GTEST_STRINGIFY_(T))
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-printers.h b/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
index 076c9de1f..a91e8b8b1 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Test - The Google C++ Testing and Mocking Framework
//
// This file implements a universal value printer that can print a
@@ -95,7 +94,9 @@
// being defined as many user-defined container types don't have
// value_type.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
@@ -257,12 +258,10 @@ struct ConvertibleToStringViewPrinter {
#endif
};
-
// Prints the given number of bytes in the given object to the given
// ostream.
GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
- size_t count,
- ::std::ostream* os);
+ size_t count, ::std::ostream* os);
struct RawBytesPrinter {
// SFINAE on `sizeof` to make sure we have a complete type.
template <typename T, size_t = sizeof(T)>
@@ -360,7 +359,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
#endif
@@ -375,12 +374,12 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
// to point to a NUL-terminated string, and thus can print it as a string.
#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
- template <> \
- class FormatForComparison<CharType*, OtherStringType> { \
- public: \
- static ::std::string Format(CharType* value) { \
- return ::testing::PrintToString(value); \
- } \
+ template <> \
+ class FormatForComparison<CharType*, OtherStringType> { \
+ public: \
+ static ::std::string Format(CharType* value) { \
+ return ::testing::PrintToString(value); \
+ } \
}
GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
@@ -410,8 +409,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(
- const T1& value, const T2& /* other_operand */) {
+std::string FormatForComparisonFailureMessage(const T1& value,
+ const T2& /* other_operand */) {
return FormatForComparison<T1, T2>::Format(value);
}
@@ -479,6 +478,12 @@ inline void PrintTo(char8_t c, ::std::ostream* os) {
}
#endif
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os);
+GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os);
+#endif // __SIZEOF_INT128__
+
// Overloads for C strings.
GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
inline void PrintTo(char* s, ::std::ostream* os) {
@@ -545,7 +550,7 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
}
// Overloads for ::std::string.
-GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os);
inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
PrintStringTo(s, os);
}
@@ -572,7 +577,7 @@ inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
// Overloads for ::std::wstring.
#if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os);
inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
PrintWideStringTo(s, os);
}
@@ -587,6 +592,12 @@ inline void PrintTo(internal::StringView sp, ::std::ostream* os) {
inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
+#if GTEST_HAS_RTTI
+inline void PrintTo(const std::type_info& info, std::ostream* os) {
+ *os << internal::GetTypeName(info);
+}
+#endif // GTEST_HAS_RTTI
+
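For orientation, a hedged sketch (hypothetical Point type, not part of the patch) of how a user-supplied PrintTo overload is picked up by the universal printer declared in this header:

#include <ostream>
#include "gtest/gtest.h"

struct Point { int x, y; };

// Found via argument-dependent lookup, so failure messages show "(1, 2)"
// instead of a raw byte dump.
void PrintTo(const Point& p, std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}

TEST(PrinterSketch, UsesCustomPrintTo) {
  EXPECT_EQ("(1, 2)", ::testing::PrintToString(Point{1, 2}));
}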
template <typename T>
void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
UniversalPrinter<T&>::Print(ref.get(), os);
@@ -744,6 +755,14 @@ class UniversalPrinter<Optional<T>> {
}
};
+template <>
+class UniversalPrinter<decltype(Nullopt())> {
+ public:
+ static void Print(decltype(Nullopt()), ::std::ostream* os) {
+ *os << "(nullopt)";
+ }
+};
+
#endif // GTEST_INTERNAL_HAS_OPTIONAL
#if GTEST_INTERNAL_HAS_VARIANT
@@ -802,8 +821,8 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
}
}
// This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(
- const char* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const char* begin, size_t len,
+ ::std::ostream* os);
#ifdef __cpp_char8_t
// This overload prints a (const) char8_t array compactly.
@@ -820,8 +839,8 @@ GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
::std::ostream* os);
// This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(
- const wchar_t* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len,
+ ::std::ostream* os);
// Implements printing an array type T[N].
template <typename T, size_t N>
@@ -980,10 +999,10 @@ void UniversalPrint(const T& value, ::std::ostream* os) {
UniversalPrinter<T1>::Print(value, os);
}
-typedef ::std::vector< ::std::string> Strings;
+typedef ::std::vector<::std::string> Strings;
- // Tersely prints the first N fields of a tuple to a string vector,
- // one element for each field.
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
template <typename Tuple>
void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
Strings*) {}
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-spi.h b/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
index eacef4466..bec8c4810 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
@@ -27,12 +27,9 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// Utilities for testing Google Test itself and code that uses Google Test
// (e.g. frameworks built on top of Google Test).
-// GOOGLETEST_CM0004 DO NOT DELETE
-
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
@@ -88,7 +85,10 @@ class GTEST_API_ ScopedFakeTestPartResultReporter
TestPartResultReporterInterface* old_reporter_;
TestPartResultArray* const result_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+ ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) =
+ delete;
+ ScopedFakeTestPartResultReporter& operator=(
+ const ScopedFakeTestPartResultReporter&) = delete;
};
namespace internal {
@@ -104,12 +104,14 @@ class GTEST_API_ SingleFailureChecker {
SingleFailureChecker(const TestPartResultArray* results,
TestPartResult::Type type, const std::string& substr);
~SingleFailureChecker();
+
private:
const TestPartResultArray* const results_;
const TestPartResult::Type type_;
const std::string substr_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+ SingleFailureChecker(const SingleFailureChecker&) = delete;
+ SingleFailureChecker& operator=(const SingleFailureChecker&) = delete;
};
} // namespace internal
@@ -119,7 +121,8 @@ class GTEST_API_ SingleFailureChecker {
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// A set of macros for testing Google Test assertions or code that's expected
-// to generate Google Test fatal failures. It verifies that the given
+// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but
+// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given
// statement will cause exactly one fatal Google Test failure with 'substr'
// being part of the failure message.
//
@@ -141,44 +144,46 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// helper macro, due to some peculiarity in how the preprocessor
// works. The AcceptsMacroThatExpandsToUnprotectedComma test in
// gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr) \
- do { \
- class GTestExpectFatalFailureHelper {\
- public:\
- static void Execute() { statement; }\
- };\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
- GTestExpectFatalFailureHelper::Execute();\
- }\
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
} while (::testing::internal::AlwaysFalse())
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
- do { \
- class GTestExpectFatalFailureHelper {\
- public:\
- static void Execute() { statement; }\
- };\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ALL_THREADS, &gtest_failures);\
- GTestExpectFatalFailureHelper::Execute();\
- }\
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
} while (::testing::internal::AlwaysFalse())
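A minimal sketch (hypothetical helper, not from the patch) of EXPECT_FATAL_FAILURE as reindented above; the statement must live in a helper because it may not reference local non-static variables:

#include "gtest/gtest-spi.h"
#include "gtest/gtest.h"

static void FailFatally() { ASSERT_EQ(1, 2); }  // produces one fatal failure

TEST(SpiSketch, CatchesExactlyOneFatalFailure) {
  EXPECT_FATAL_FAILURE(FailFatally(), "Expected equality");
}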
// A macro for testing Google Test assertions or code that's expected to
-// generate Google Test non-fatal failures. It asserts that the given
-// statement will cause exactly one non-fatal Google Test failure with 'substr'
-// being part of the failure message.
+// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ,
+// but not from an ASSERT_EQ). It asserts that the given statement will cause
+// exactly one non-fatal Google Test failure with 'substr' being part of the
+// failure message.
//
// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
// affects and considers failures generated in the current thread and
@@ -207,32 +212,37 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// instead of
// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
// to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
- do {\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
&gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
- (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
- if (::testing::internal::AlwaysTrue()) { statement; }\
- }\
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
} while (::testing::internal::AlwaysFalse())
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
- do {\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
- (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
- &gtest_failures);\
- if (::testing::internal::AlwaysTrue()) { statement; }\
- }\
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
} while (::testing::internal::AlwaysFalse())
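And the non-fatal counterpart, as a sketch under the same assumptions:

#include "gtest/gtest-spi.h"
#include "gtest/gtest.h"

TEST(SpiSketch, CatchesExactlyOneNonFatalFailure) {
  // EXPECT_EQ produces a non-fatal failure, which the macro intercepts.
  EXPECT_NONFATAL_FAILURE(EXPECT_EQ(1, 2), "Expected equality");
}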
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h b/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
index 203fdf98c..09cc8c34f 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
@@ -26,14 +26,17 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
#include <iosfwd>
#include <vector>
+
#include "gtest/internal/gtest-internal.h"
#include "gtest/internal/gtest-string.h"
@@ -142,7 +145,8 @@ class GTEST_API_ TestPartResultArray {
private:
std::vector<TestPartResult> array_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+ TestPartResultArray(const TestPartResultArray&) = delete;
+ TestPartResultArray& operator=(const TestPartResultArray&) = delete;
};
// This interface knows how to report a test part result.
@@ -168,11 +172,13 @@ class GTEST_API_ HasNewFatalFailureHelper
~HasNewFatalFailureHelper() override;
void ReportTestPartResult(const TestPartResult& result) override;
bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+
private:
bool has_new_fatal_failure_;
TestPartResultReporterInterface* original_reporter_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+ HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete;
+ HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete;
};
} // namespace internal
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h b/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
index 9fdc6be10..bd35a3266 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
@@ -27,7 +27,9 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
@@ -190,7 +192,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
typedef ::testing::internal::GenerateTypeList<Types>::type \
GTEST_TYPE_PARAMS_(CaseName); \
typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
- GTEST_NAME_GENERATOR_(CaseName)
+ GTEST_NAME_GENERATOR_(CaseName)
#define TYPED_TEST(CaseName, TestName) \
static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \
@@ -256,7 +258,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
// #included in multiple translation units linked together.
#define TYPED_TEST_SUITE_P(SuiteName) \
static ::testing::internal::TypedTestSuitePState \
- GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+ GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
// Legacy API is deprecated but still available
#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
@@ -301,21 +303,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
REGISTER_TYPED_TEST_SUITE_P
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \
- static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \
- "test-suit-prefix must not be empty"); \
- static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \
- ::testing::internal::TypeParameterizedTestSuite< \
- SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \
- ::testing::internal::GenerateTypeList<Types>::type>:: \
- Register(GTEST_STRINGIFY_(Prefix), \
- ::testing::internal::CodeLocation(__FILE__, __LINE__), \
- &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \
- GTEST_STRINGIFY_(SuiteName), \
- GTEST_REGISTERED_TEST_NAMES_(SuiteName), \
- ::testing::internal::GenerateNames< \
- ::testing::internal::NameGeneratorSelector< \
- __VA_ARGS__>::type, \
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \
+ static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \
+ "test-suit-prefix must not be empty"); \
+ static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::internal::TypeParameterizedTestSuite< \
+ SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \
+ ::testing::internal::GenerateTypeList<Types>::type>:: \
+ Register(GTEST_STRINGIFY_(Prefix), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+ &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \
+ GTEST_STRINGIFY_(SuiteName), \
+ GTEST_REGISTERED_TEST_NAMES_(SuiteName), \
+ ::testing::internal::GenerateNames< \
+ ::testing::internal::NameGeneratorSelector< \
+ __VA_ARGS__>::type, \
::testing::internal::GenerateTypeList<Types>::type>())
// Legacy API is deprecated but still available
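A hedged sketch (hypothetical suite and types, not part of the patch) tying together the type-parameterized test macros reindented above:

#include "gtest/gtest.h"

template <typename T>
class ArithmeticTest : public ::testing::Test {};

TYPED_TEST_SUITE_P(ArithmeticTest);

TYPED_TEST_P(ArithmeticTest, ZeroIsNeutralForAddition) {
  EXPECT_EQ(TypeParam(7), TypeParam(7) + TypeParam(0));
}

REGISTER_TYPED_TEST_SUITE_P(ArithmeticTest, ZeroIsNeutralForAddition);

using NumericTypes = ::testing::Types<int, long, double>;
INSTANTIATE_TYPED_TEST_SUITE_P(My, ArithmeticTest, NumericTypes);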
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest.h b/third_party/googletest/src/googletest/include/gtest/gtest.h
index 7a5d057c4..d19a587a1 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the public API for Google Test. It should be
@@ -47,8 +46,6 @@
// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
// easyUnit framework.
-// GOOGLETEST_CM0001 DO NOT DELETE
-
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
@@ -59,31 +56,22 @@
#include <type_traits>
#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
+#include "gtest/gtest-assertion-result.h"
#include "gtest/gtest-death-test.h"
#include "gtest/gtest-matchers.h"
#include "gtest/gtest-message.h"
#include "gtest/gtest-param-test.h"
#include "gtest/gtest-printers.h"
-#include "gtest/gtest_prod.h"
#include "gtest/gtest-test-part.h"
#include "gtest/gtest-typed-test.h"
+#include "gtest/gtest_pred_impl.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
/* class A needs to have dll-interface to be used by clients of class B */)
-namespace testing {
-
-// Silence C4100 (unreferenced formal parameter) and 4805
-// unsafe mix of type 'const int' and type 'const bool'
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4805)
-# pragma warning(disable:4100)
-#endif
-
-
// Declares the flags.
// This flag temporary enables the disabled tests.
@@ -138,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed);
// is 1. If the value is -1 the tests are repeating forever.
GTEST_DECLARE_int32_(repeat);
+// This flag controls whether Google Test Environments are recreated for each
+// repeat of the tests. The default value is true. If set to false the global
+// test Environment objects are only set up once, for the first iteration, and
+// only torn down once, for the last.
+GTEST_DECLARE_bool_(recreate_environments_when_repeating);
+
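Assuming the usual mapping from GTEST_DECLARE_ flags to --gtest_* switches, a run that repeats the suite but sets up the global environments only once might look like:

  ./my_test --gtest_repeat=10 --gtest_recreate_environments_when_repeating=false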
// This flag controls whether Google Test includes Google Test internal
// stack frames in failure stack traces.
GTEST_DECLARE_bool_(show_internal_stack_frames);
@@ -163,6 +157,16 @@ GTEST_DECLARE_string_(stream_result_to);
GTEST_DECLARE_string_(flagfile);
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and 4805
+// unsafe mix of type 'const int' and type 'const bool'
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4805)
+#pragma warning(disable : 4100)
+#endif
+
// The upper limit for valid stack trace depths.
const int kMaxStackTraceDepth = 100;
@@ -201,193 +205,6 @@ using TestCase = TestSuite;
class TestInfo;
class UnitTest;
-// A class for indicating whether an assertion was successful. When
-// the assertion wasn't successful, the AssertionResult object
-// remembers a non-empty message that describes how it failed.
-//
-// To create an instance of this class, use one of the factory functions
-// (AssertionSuccess() and AssertionFailure()).
-//
-// This class is useful for two purposes:
-// 1. Defining predicate functions to be used with Boolean test assertions
-// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
-// 2. Defining predicate-format functions to be
-// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
-//
-// For example, if you define IsEven predicate:
-//
-// testing::AssertionResult IsEven(int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess();
-// else
-// return testing::AssertionFailure() << n << " is odd";
-// }
-//
-// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
-// will print the message
-//
-// Value of: IsEven(Fib(5))
-// Actual: false (5 is odd)
-// Expected: true
-//
-// instead of a more opaque
-//
-// Value of: IsEven(Fib(5))
-// Actual: false
-// Expected: true
-//
-// in case IsEven is a simple Boolean predicate.
-//
-// If you expect your predicate to be reused and want to support informative
-// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
-// about half as often as positive ones in our tests), supply messages for
-// both success and failure cases:
-//
-// testing::AssertionResult IsEven(int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess() << n << " is even";
-// else
-// return testing::AssertionFailure() << n << " is odd";
-// }
-//
-// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
-//
-// Value of: IsEven(Fib(6))
-// Actual: true (8 is even)
-// Expected: false
-//
-// NB: Predicates that support negative Boolean assertions have reduced
-// performance in positive ones so be careful not to use them in tests
-// that have lots (tens of thousands) of positive Boolean assertions.
-//
-// To use this class with EXPECT_PRED_FORMAT assertions such as:
-//
-// // Verifies that Foo() returns an even number.
-// EXPECT_PRED_FORMAT1(IsEven, Foo());
-//
-// you need to define:
-//
-// testing::AssertionResult IsEven(const char* expr, int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess();
-// else
-// return testing::AssertionFailure()
-// << "Expected: " << expr << " is even\n Actual: it's " << n;
-// }
-//
-// If Foo() returns 5, you will see the following message:
-//
-// Expected: Foo() is even
-// Actual: it's 5
-//
-class GTEST_API_ AssertionResult {
- public:
- // Copy constructor.
- // Used in EXPECT_TRUE/FALSE(assertion_result).
- AssertionResult(const AssertionResult& other);
-
-// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
-// This warning is not emitted in Visual Studio 2017.
-// This warning is off by default starting in Visual Studio 2019 but can be
-// enabled with command-line options.
-#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
-#endif
-
- // Used in the EXPECT_TRUE/FALSE(bool_expression).
- //
- // T must be contextually convertible to bool.
- //
- // The second parameter prevents this overload from being considered if
- // the argument is implicitly convertible to AssertionResult. In that case
- // we want AssertionResult's copy constructor to be used.
- template <typename T>
- explicit AssertionResult(
- const T& success,
- typename std::enable_if<
- !std::is_convertible<T, AssertionResult>::value>::type*
- /*enabler*/
- = nullptr)
- : success_(success) {}
-
-#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
- GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif
-
- // Assignment operator.
- AssertionResult& operator=(AssertionResult other) {
- swap(other);
- return *this;
- }
-
- // Returns true if and only if the assertion succeeded.
- operator bool() const { return success_; } // NOLINT
-
- // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
- AssertionResult operator!() const;
-
- // Returns the text streamed into this AssertionResult. Test assertions
- // use it when they fail (i.e., the predicate's outcome doesn't match the
- // assertion's expectation). When nothing has been streamed into the
- // object, returns an empty string.
- const char* message() const {
- return message_.get() != nullptr ? message_->c_str() : "";
- }
- // Deprecated; please use message() instead.
- const char* failure_message() const { return message(); }
-
- // Streams a custom failure message into this object.
- template <typename T> AssertionResult& operator<<(const T& value) {
- AppendMessage(Message() << value);
- return *this;
- }
-
- // Allows streaming basic output manipulators such as endl or flush into
- // this object.
- AssertionResult& operator<<(
- ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
- AppendMessage(Message() << basic_manipulator);
- return *this;
- }
-
- private:
- // Appends the contents of message to message_.
- void AppendMessage(const Message& a_message) {
- if (message_.get() == nullptr) message_.reset(new ::std::string);
- message_->append(a_message.GetString().c_str());
- }
-
- // Swap the contents of this AssertionResult with other.
- void swap(AssertionResult& other);
-
- // Stores result of the assertion predicate.
- bool success_;
- // Stores the message describing the condition in case the expectation
- // construct is not satisfied with the predicate's outcome.
- // Referenced via a pointer to avoid taking too much stack frame space
- // with test assertions.
- std::unique_ptr< ::std::string> message_;
-};
-
-// Makes a successful assertion result.
-GTEST_API_ AssertionResult AssertionSuccess();
-
-// Makes a failed assertion result.
-GTEST_API_ AssertionResult AssertionFailure();
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
-
-} // namespace testing
-
-// Includes the auto-generated header that implements a family of generic
-// predicate assertion macros. This include comes late because it relies on
-// APIs declared above.
-#include "gtest/gtest_pred_impl.h"
-
-namespace testing {
-
// The abstract class that all tests inherit from.
//
// In Google Test, a unit test program contains one or many TestSuites, and
@@ -522,7 +339,8 @@ class GTEST_API_ Test {
virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
// We disallow copying Tests.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+ Test(const Test&) = delete;
+ Test& operator=(const Test&) = delete;
};
typedef internal::TimeInMillis TimeInMillis;
@@ -536,24 +354,17 @@ class TestProperty {
// C'tor. TestProperty does NOT have a default constructor.
// Always use this constructor (with parameters) to create a
// TestProperty object.
- TestProperty(const std::string& a_key, const std::string& a_value) :
- key_(a_key), value_(a_value) {
- }
+ TestProperty(const std::string& a_key, const std::string& a_value)
+ : key_(a_key), value_(a_value) {}
// Gets the user supplied key.
- const char* key() const {
- return key_.c_str();
- }
+ const char* key() const { return key_.c_str(); }
// Gets the user supplied value.
- const char* value() const {
- return value_.c_str();
- }
+ const char* value() const { return value_.c_str(); }
// Sets a new value, overriding the one supplied in the constructor.
- void SetValue(const std::string& new_value) {
- value_ = new_value;
- }
+ void SetValue(const std::string& new_value) { value_ = new_value; }
private:
// The key supplied by the user.
@@ -687,7 +498,8 @@ class GTEST_API_ TestResult {
TimeInMillis elapsed_time_;
// We disallow copying TestResult.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+ TestResult(const TestResult&) = delete;
+ TestResult& operator=(const TestResult&) = delete;
}; // class TestResult
// A TestInfo object stores the following information about a test:
@@ -811,8 +623,8 @@ class GTEST_API_ TestInfo {
}
// These fields are immutable properties of the test.
- const std::string test_suite_name_; // test suite name
- const std::string name_; // Test name
+ const std::string test_suite_name_; // test suite name
+ const std::string name_; // Test name
// Name of the parameter type, or NULL if this is not a typed or a
// type-parameterized test.
const std::unique_ptr<const ::std::string> type_param_;
@@ -833,7 +645,8 @@ class GTEST_API_ TestInfo {
// test for the second time.
TestResult result_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+ TestInfo(const TestInfo&) = delete;
+ TestInfo& operator=(const TestInfo&) = delete;
};
// A test suite, which consists of a vector of TestInfos.
@@ -941,7 +754,7 @@ class GTEST_API_ TestSuite {
// Adds a TestInfo to this test suite. Will delete the TestInfo upon
// destruction of the TestSuite object.
- void AddTestInfo(TestInfo * test_info);
+ void AddTestInfo(TestInfo* test_info);
// Clears the results of all tests in this test suite.
void ClearResult();
@@ -1042,7 +855,8 @@ class GTEST_API_ TestSuite {
TestResult ad_hoc_test_result_;
// We disallow copying TestSuites.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
+ TestSuite(const TestSuite&) = delete;
+ TestSuite& operator=(const TestSuite&) = delete;
};
// An Environment object is capable of setting up and tearing down an
@@ -1069,6 +883,7 @@ class Environment {
// Override this to define how to tear down the environment.
virtual void TearDown() {}
+
private:
// If you see an error about overriding the following function or
// about it being private, you have mis-spelled SetUp() as Setup().
@@ -1120,6 +935,9 @@ class TestEventListener {
// Fired before the test starts.
virtual void OnTestStart(const TestInfo& test_info) = 0;
+ // Fired when a test is disabled
+ virtual void OnTestDisabled(const TestInfo& /*test_info*/) {}
+
// Fired after a failed assertion or a SUCCEED() invocation.
// If you want to throw an exception from this function to skip to the next
// TEST, it must be AssertionException defined above, or inherited from it.
@@ -1143,8 +961,7 @@ class TestEventListener {
virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
// Fired after each iteration of tests finishes.
- virtual void OnTestIterationEnd(const UnitTest& unit_test,
- int iteration) = 0;
+ virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0;
// Fired after all test activities have ended.
virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
@@ -1169,6 +986,7 @@ class EmptyTestEventListener : public TestEventListener {
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
void OnTestEnd(const TestInfo& /*test_info*/) override {}
void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
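A hedged sketch (hypothetical listener, not part of the patch) of the new OnTestDisabled hook added above:

#include "gtest/gtest.h"

// Counts disabled tests as they are reported during a run.
class DisabledTestCounter : public ::testing::EmptyTestEventListener {
 public:
  void OnTestDisabled(const ::testing::TestInfo& /*test_info*/) override {
    ++disabled_count_;
  }
  int disabled_count() const { return disabled_count_; }

 private:
  int disabled_count_ = 0;
};

// Typical registration, e.g. in main() after InitGoogleTest();
// the listener list takes ownership of the pointer:
//   ::testing::UnitTest::GetInstance()->listeners().Append(
//       new DisabledTestCounter);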
@@ -1258,7 +1076,8 @@ class GTEST_API_ TestEventListeners {
TestEventListener* default_xml_generator_;
// We disallow copying TestEventListeners.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+ TestEventListeners(const TestEventListeners&) = delete;
+ TestEventListeners& operator=(const TestEventListeners&) = delete;
};
// A UnitTest consists of a vector of TestSuites.
@@ -1301,8 +1120,7 @@ class GTEST_API_ UnitTest {
// Returns the TestInfo object for the test that's currently running,
// or NULL if no test is running.
- const TestInfo* current_test_info() const
- GTEST_LOCK_EXCLUDED_(mutex_);
+ const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
// Returns the random seed used at the start of the current test run.
int random_seed() const;
@@ -1408,8 +1226,7 @@ class GTEST_API_ UnitTest {
// eventually call this to report their results. The user code
// should use the assertion macros instead of calling this directly.
void AddTestPartResult(TestPartResult::Type result_type,
- const char* file_name,
- int line_number,
+ const char* file_name, int line_number,
const std::string& message,
const std::string& os_stack_trace)
GTEST_LOCK_EXCLUDED_(mutex_);
@@ -1440,8 +1257,7 @@ class GTEST_API_ UnitTest {
friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites();
friend internal::UnitTestImpl* internal::GetUnitTestImpl();
friend void internal::ReportFailureInUnknownLocation(
- TestPartResult::Type result_type,
- const std::string& message);
+ TestPartResult::Type result_type, const std::string& message);
// Creates an empty UnitTest.
UnitTest();
@@ -1455,8 +1271,7 @@ class GTEST_API_ UnitTest {
GTEST_LOCK_EXCLUDED_(mutex_);
// Pops a trace from the per-thread Google Test trace stack.
- void PopGTestTrace()
- GTEST_LOCK_EXCLUDED_(mutex_);
+ void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
// Protects mutable state in *impl_. This is mutable as some const
// methods need to lock it too.
@@ -1469,7 +1284,8 @@ class GTEST_API_ UnitTest {
internal::UnitTestImpl* impl_;
// We disallow copying UnitTest.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+ UnitTest(const UnitTest&) = delete;
+ UnitTest& operator=(const UnitTest&) = delete;
};
// A convenient wrapper for adding an environment for the test
@@ -1520,13 +1336,11 @@ namespace internal {
// when calling EXPECT_* in a tight loop.
template <typename T1, typename T2>
AssertionResult CmpHelperEQFailure(const char* lhs_expression,
- const char* rhs_expression,
- const T1& lhs, const T2& rhs) {
- return EqFailure(lhs_expression,
- rhs_expression,
+ const char* rhs_expression, const T1& lhs,
+ const T2& rhs) {
+ return EqFailure(lhs_expression, rhs_expression,
FormatForComparisonFailureMessage(lhs, rhs),
- FormatForComparisonFailureMessage(rhs, lhs),
- false);
+ FormatForComparisonFailureMessage(rhs, lhs), false);
}
// This block of code defines operator==/!=
@@ -1539,8 +1353,7 @@ inline bool operator!=(faketype, faketype) { return false; }
// The helper function for {ASSERT|EXPECT}_EQ.
template <typename T1, typename T2>
AssertionResult CmpHelperEQ(const char* lhs_expression,
- const char* rhs_expression,
- const T1& lhs,
+ const char* rhs_expression, const T1& lhs,
const T2& rhs) {
if (lhs == rhs) {
return AssertionSuccess();
@@ -1571,8 +1384,7 @@ class EqHelper {
// Even though its body looks the same as the above version, we
// cannot merge the two, as it will make anonymous enums unhappy.
static AssertionResult Compare(const char* lhs_expression,
- const char* rhs_expression,
- BiggestInt lhs,
+ const char* rhs_expression, BiggestInt lhs,
BiggestInt rhs) {
return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
}
@@ -1607,16 +1419,16 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-template <typename T1, typename T2>\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
- const T1& val1, const T2& val2) {\
- if (val1 op val2) {\
- return AssertionSuccess();\
- } else {\
- return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
- }\
-}
+#define GTEST_IMPL_CMP_HELPER_(op_name, op) \
+ template <typename T1, typename T2> \
+ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+ const T1& val1, const T2& val2) { \
+ if (val1 op val2) { \
+ return AssertionSuccess(); \
+ } else { \
+ return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \
+ } \
+ }
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
@@ -1638,49 +1450,42 @@ GTEST_IMPL_CMP_HELPER_(GT, >)
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRNE.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRCASENE.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
-
+ const char* s1, const char* s2);
// Helper function for *_STREQ on wide strings.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
const char* s2_expression,
- const wchar_t* s1,
- const wchar_t* s2);
+ const wchar_t* s1, const wchar_t* s2);
// Helper function for *_STRNE on wide strings.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
const char* s2_expression,
- const wchar_t* s1,
- const wchar_t* s2);
+ const wchar_t* s1, const wchar_t* s2);
} // namespace internal
@@ -1692,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
//
// The {needle,haystack}_expr arguments are the stringified
// expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
#if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
#endif // GTEST_HAS_STD_WSTRING
namespace internal {
@@ -1732,8 +1545,7 @@ namespace internal {
template <typename RawType>
AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
const char* rhs_expression,
- RawType lhs_value,
- RawType rhs_value) {
+ RawType lhs_value, RawType rhs_value) {
const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
if (lhs.AlmostEquals(rhs)) {
@@ -1748,10 +1560,8 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
<< rhs_value;
- return EqFailure(lhs_expression,
- rhs_expression,
- StringStreamToString(&lhs_ss),
- StringStreamToString(&rhs_ss),
+ return EqFailure(lhs_expression, rhs_expression,
+ StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
false);
}
@@ -1761,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
const char* expr2,
const char* abs_error_expr,
- double val1,
- double val2,
+ double val1, double val2,
double abs_error);
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -1770,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
class GTEST_API_ AssertHelper {
public:
// Constructor.
- AssertHelper(TestPartResult::Type type,
- const char* file,
- int line,
+ AssertHelper(TestPartResult::Type type, const char* file, int line,
const char* message);
~AssertHelper();
@@ -1786,11 +1593,9 @@ class GTEST_API_ AssertHelper {
// re-using stack space even for temporary variables, so every EXPECT_EQ
// reserves stack space for another AssertHelper.
struct AssertHelperData {
- AssertHelperData(TestPartResult::Type t,
- const char* srcfile,
- int line_num,
+ AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num,
const char* msg)
- : type(t), file(srcfile), line(line_num), message(msg) { }
+ : type(t), file(srcfile), line(line_num), message(msg) {}
TestPartResult::Type const type;
const char* const file;
@@ -1798,12 +1603,14 @@ class GTEST_API_ AssertHelper {
std::string const message;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+ AssertHelperData(const AssertHelperData&) = delete;
+ AssertHelperData& operator=(const AssertHelperData&) = delete;
};
AssertHelperData* const data_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+ AssertHelper(const AssertHelper&) = delete;
+ AssertHelper& operator=(const AssertHelper&) = delete;
};
} // namespace internal
@@ -1860,15 +1667,14 @@ class WithParamInterface {
private:
// Sets parameter value. The caller is responsible for making sure the value
// remains alive and unchanged throughout the current test.
- static void SetParam(const ParamType* parameter) {
- parameter_ = parameter;
- }
+ static void SetParam(const ParamType* parameter) { parameter_ = parameter; }
// Static value used for accessing parameter during a test lifetime.
static const ParamType* parameter_;
// TestClass must be a subclass of WithParamInterface<T> and Test.
- template <class TestClass> friend class internal::ParameterizedTestFactory;
+ template <class TestClass>
+ friend class internal::ParameterizedTestFactory;
};
template <typename T>
@@ -1878,8 +1684,7 @@ const T* WithParamInterface<T>::parameter_ = nullptr;
// WithParamInterface, and can just inherit from ::testing::TestWithParam.
template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
+class TestWithParam : public Test, public WithParamInterface<T> {};
// Macros for indicating success/failure in test code.
@@ -1910,7 +1715,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Generates a nonfatal failure at the given source file location with
// a generic message.
-#define ADD_FAILURE_AT(file, line) \
+#define ADD_FAILURE_AT(file, line) \
GTEST_MESSAGE_AT_(file, line, "Failed", \
::testing::TestPartResult::kNonFatalFailure)
@@ -1925,7 +1730,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Define this macro to 1 to omit the definition of FAIL(), which is a
// generic name and clashes with some other libraries.
#if !GTEST_DONT_DEFINE_FAIL
-# define FAIL() GTEST_FAIL()
+#define FAIL() GTEST_FAIL()
#endif
// Generates a success with a generic message.
@@ -1934,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Define this macro to 1 to omit the definition of SUCCEED(), which
// is a generic name and clashes with some other libraries.
#if !GTEST_DONT_DEFINE_SUCCEED
-# define SUCCEED() GTEST_SUCCEED()
+#define SUCCEED() GTEST_SUCCEED()
#endif
// Macros for testing exceptions.
@@ -1962,16 +1767,15 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Boolean assertions. Condition can be either a Boolean expression or an
// AssertionResult. For more information on how to use AssertionResult with
// these macros see comments on that class.
-#define GTEST_EXPECT_TRUE(condition) \
+#define GTEST_EXPECT_TRUE(condition) \
GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
GTEST_NONFATAL_FAILURE_)
-#define GTEST_EXPECT_FALSE(condition) \
+#define GTEST_EXPECT_FALSE(condition) \
GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
GTEST_NONFATAL_FAILURE_)
#define GTEST_ASSERT_TRUE(condition) \
- GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
- GTEST_FATAL_FAILURE_)
-#define GTEST_ASSERT_FALSE(condition) \
+ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define GTEST_ASSERT_FALSE(condition) \
GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
GTEST_FATAL_FAILURE_)
@@ -2070,27 +1874,27 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// ASSERT_XY(), which clashes with some users' own code.
#if !GTEST_DONT_DEFINE_ASSERT_EQ
-# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_NE
-# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_LE
-# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_LT
-# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_GE
-# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_GT
-# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
#endif
// C-string Comparisons. All tests treat NULL and any non-NULL string
@@ -2115,7 +1919,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
#define EXPECT_STRCASEEQ(s1, s2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define EXPECT_STRCASENE(s1, s2)\
+#define EXPECT_STRCASENE(s1, s2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
#define ASSERT_STREQ(s1, s2) \
@@ -2124,7 +1928,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
#define ASSERT_STRCASEEQ(s1, s2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define ASSERT_STRCASENE(s1, s2)\
+#define ASSERT_STRCASENE(s1, s2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
// Macros for comparing floating-point numbers.
@@ -2141,29 +1945,29 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// FloatingPoint template class in gtest-internal.h if you are
// interested in the implementation details.
-#define EXPECT_FLOAT_EQ(val1, val2)\
+#define EXPECT_FLOAT_EQ(val1, val2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
val1, val2)
-#define EXPECT_DOUBLE_EQ(val1, val2)\
+#define EXPECT_DOUBLE_EQ(val1, val2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
val1, val2)
-#define ASSERT_FLOAT_EQ(val1, val2)\
+#define ASSERT_FLOAT_EQ(val1, val2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
val1, val2)
-#define ASSERT_DOUBLE_EQ(val1, val2)\
+#define ASSERT_DOUBLE_EQ(val1, val2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
val1, val2)
-#define EXPECT_NEAR(val1, val2, abs_error)\
- EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
- val1, val2, abs_error)
+#define EXPECT_NEAR(val1, val2, abs_error) \
+ EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
-#define ASSERT_NEAR(val1, val2, abs_error)\
- ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
- val1, val2, abs_error)
+#define ASSERT_NEAR(val1, val2, abs_error) \
+ ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
// These predicate format functions work on floating-point values, and
// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
@@ -2177,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
double val1, double val2);
-
#if GTEST_OS_WINDOWS
// Macros that test for HRESULT failure and success, these are only useful
@@ -2189,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
// expected result and the actual result with both a human-readable
// string representation of the error, if available, as well as the
// hex result code.
-# define EXPECT_HRESULT_SUCCEEDED(expr) \
- EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-# define ASSERT_HRESULT_SUCCEEDED(expr) \
- ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-# define EXPECT_HRESULT_FAILED(expr) \
- EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define EXPECT_HRESULT_FAILED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
-# define ASSERT_HRESULT_FAILED(expr) \
- ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define ASSERT_HRESULT_FAILED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
#endif // GTEST_OS_WINDOWS
@@ -2214,9 +2017,9 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
//
#define ASSERT_NO_FATAL_FAILURE(statement) \
- GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
#define EXPECT_NO_FATAL_FAILURE(statement) \
- GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
// Causes a trace (including the given source file path and line number,
// and the given message) to be included in every test failure message generated
@@ -2258,7 +2061,8 @@ class GTEST_API_ ScopedTrace {
private:
void PushTrace(const char* file, int line, std::string message);
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+ ScopedTrace(const ScopedTrace&) = delete;
+ ScopedTrace& operator=(const ScopedTrace&) = delete;
} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its
// c'tor and d'tor. Therefore it doesn't
// need to be used otherwise.
@@ -2278,9 +2082,9 @@ class GTEST_API_ ScopedTrace {
// Assuming that each thread maintains its own stack of traces.
// Therefore, a SCOPED_TRACE() would (correctly) only affect the
// assertions in its own thread.
-#define SCOPED_TRACE(message) \
- ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
- __FILE__, __LINE__, (message))
+#define SCOPED_TRACE(message) \
+ ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+ __FILE__, __LINE__, (message))
// Compile-time assertion for type equality.
// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
@@ -2378,20 +2182,19 @@ constexpr bool StaticAssertTypeEq() noexcept {
// EXPECT_EQ(a_.size(), 0);
// EXPECT_EQ(b_.size(), 1);
// }
-//
-// GOOGLETEST_CM0011 DO NOT DELETE
-#if !GTEST_DONT_DEFINE_TEST
-#define TEST_F(test_fixture, test_name)\
+#define GTEST_TEST_F(test_fixture, test_name) \
GTEST_TEST_(test_fixture, test_name, test_fixture, \
::testing::internal::GetTypeId<test_fixture>())
-#endif // !GTEST_DONT_DEFINE_TEST
+#if !GTEST_DONT_DEFINE_TEST_F
+#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name)
+#endif
// Returns a path to temporary directory.
// Tries to determine an appropriate directory for the platform.
GTEST_API_ std::string TempDir();
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
// Dynamically registers a test with the framework.
@@ -2445,6 +2248,7 @@ GTEST_API_ std::string TempDir();
// }
// ...
// int main(int argc, char** argv) {
+// ::testing::InitGoogleTest(&argc, argv);
// std::vector<int> values_to_test = LoadValuesFromConfig();
// RegisterMyTests(values_to_test);
// ...
@@ -2486,9 +2290,7 @@ TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
// namespace and has an all-caps name.
int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
-inline int RUN_ALL_TESTS() {
- return ::testing::UnitTest::GetInstance()->Run();
-}
+inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
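
[Editorial sketch, not part of the diff] The gtest.h hunks above repeatedly swap the GTEST_DISALLOW_COPY_AND_ASSIGN_ macro for explicitly deleted special members in AssertHelperData, AssertHelper and ScopedTrace. The macro was shorthand for exactly the two deleted declarations the diff now spells out; a minimal sketch of the resulting idiom, using a hypothetical Widget class rather than any type from the diff:

    class Widget {
     public:
      Widget() = default;

     private:
      // Previously written as GTEST_DISALLOW_COPY_AND_ASSIGN_(Widget).
      Widget(const Widget&) = delete;             // no copy construction
      Widget& operator=(const Widget&) = delete;  // no copy assignment
    };
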
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
index 5029a9bb0..47a24aa68 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
@@ -26,17 +26,19 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command
-// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND!
//
// Implements a family of generic predicate assertion macros.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#include "gtest/gtest.h"
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
namespace testing {
@@ -72,22 +74,18 @@ namespace testing {
// GTEST_ASSERT_ is the basic statement to which all of the assertions
// in this file reduce. Don't use this in your code.
-#define GTEST_ASSERT_(expression, on_failure) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+#define GTEST_ASSERT_(expression, on_failure) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
if (const ::testing::AssertionResult gtest_ar = (expression)) \
- ; \
- else \
+ ; \
+ else \
on_failure(gtest_ar.failure_message())
-
// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
// this in your code.
-template <typename Pred,
- typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
- const char* e1,
- Pred pred,
- const T1& v1) {
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text, const char* e1,
+ Pred pred, const T1& v1) {
if (pred(v1)) return AssertionSuccess();
return AssertionFailure()
@@ -98,40 +96,27 @@ AssertionResult AssertPred1Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, v1), \
- on_failure)
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
// this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
- #v1, \
- pred, \
- v1), on_failure)
+#define GTEST_PRED1_(pred, v1, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
// Unary predicate assertion macros.
#define EXPECT_PRED_FORMAT1(pred_format, v1) \
GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
- GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
#define ASSERT_PRED_FORMAT1(pred_format, v1) \
GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
- GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- Pred pred,
- const T1& v1,
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text, const char* e1,
+ const char* e2, Pred pred, const T1& v1,
const T2& v2) {
if (pred(v1, v2)) return AssertionSuccess();
@@ -145,19 +130,14 @@ AssertionResult AssertPred2Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
- on_failure)
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
// this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
- #v1, \
- #v2, \
- pred, \
- v1, \
- v2), on_failure)
+#define GTEST_PRED2_(pred, v1, v2, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+ on_failure)
// Binary predicate assertion macros.
#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
@@ -169,22 +149,12 @@ AssertionResult AssertPred2Helper(const char* pred_text,
#define ASSERT_PRED2(pred, v1, v2) \
GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3) {
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3, Pred pred,
+ const T1& v1, const T2& v2, const T3& v3) {
if (pred(v1, v2, v3)) return AssertionSuccess();
return AssertionFailure()
@@ -198,21 +168,15 @@ AssertionResult AssertPred3Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
- on_failure)
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
// this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- pred, \
- v1, \
- v2, \
- v3), on_failure)
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_( \
+ ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+ on_failure)
// Ternary predicate assertion macros.
#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
@@ -224,25 +188,13 @@ AssertionResult AssertPred3Helper(const char* pred_text,
#define ASSERT_PRED3(pred, v1, v2, v3) \
GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3,
- typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- const char* e4,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3,
- const T4& v4) {
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3,
+ const char* e4, Pred pred, const T1& v1,
+ const T2& v2, const T3& v3, const T4& v4) {
if (pred(v1, v2, v3, v4)) return AssertionSuccess();
return AssertionFailure()
@@ -257,23 +209,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
- on_failure)
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
// this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- #v4, \
- pred, \
- v1, \
- v2, \
- v3, \
- v4), on_failure)
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+ v1, v2, v3, v4), \
+ on_failure)
// 4-ary predicate assertion macros.
#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
@@ -285,28 +229,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- const char* e4,
- const char* e5,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3,
- const T4& v4,
- const T5& v5) {
+AssertionResult AssertPred5Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3,
+ const char* e4, const char* e5, Pred pred,
+ const T1& v1, const T2& v2, const T3& v3,
+ const T4& v4, const T5& v5) {
if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
return AssertionFailure()
@@ -322,25 +253,16 @@ AssertionResult AssertPred5Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \
GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
// this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- #v4, \
- #v5, \
- pred, \
- v1, \
- v2, \
- v3, \
- v4, \
- v5), on_failure)
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+ pred, v1, v2, v3, v4, v5), \
+ on_failure)
// 5-ary predicate assertion macros.
#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
@@ -352,8 +274,6 @@ AssertionResult AssertPred5Helper(const char* pred_text,
#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
-
-
} // namespace testing
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
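
[Editorial sketch, not part of the diff] The reformatted AssertPredNHelper templates and GTEST_PREDN_ macros above are the machinery behind the public EXPECT_PRED*/ASSERT_PRED* assertions. A short usage sketch; IsMultipleOf is a hypothetical predicate introduced only for illustration:

    #include "gtest/gtest.h"

    // Plain predicate consumed by EXPECT_PRED2 below.
    bool IsMultipleOf(int value, int base) { return value % base == 0; }

    TEST(PredAssertionSketch, BinaryPredicate) {
      // Routed through GTEST_PRED2_ / AssertPred2Helper from the hunks above.
      EXPECT_PRED2(IsMultipleOf, 10, 5);
    }
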
diff --git a/third_party/googletest/src/googletest/include/gtest/gtest_prod.h b/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
index 38b9d85a5..1f37dc31c 100644
--- a/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
+++ b/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
@@ -27,9 +27,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Google C++ Testing and Mocking Framework definitions useful in production code.
-// GOOGLETEST_CM0003 DO NOT DELETE
+// Google C++ Testing and Mocking Framework definitions useful in production
+// code.
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
@@ -55,7 +54,7 @@
// Note: The test class must be in the same namespace as the class being tested.
// For example, putting MyClassTest in an anonymous namespace will not work.
-#define FRIEND_TEST(test_case_name, test_name)\
-friend class test_case_name##_##test_name##_Test
+#define FRIEND_TEST(test_case_name, test_name) \
+ friend class test_case_name##_##test_name##_Test
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
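
[Editorial sketch, not part of the diff] FRIEND_TEST, reformatted just above, befriends the class that TEST() generates so a test body can reach private members; per the header comment, the test must live in the same namespace as the class under test. A minimal sketch with a hypothetical Counter class:

    #include "gtest/gtest_prod.h"

    class Counter {
     public:
      void Increment() { ++count_; }

     private:
      // Befriends CounterTest_StartsAtZero_Test, the class TEST() generates.
      FRIEND_TEST(CounterTest, StartsAtZero);
      int count_ = 0;
    };
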
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md b/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md
index ff391fb4e..cb49e2c75 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md
+++ b/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md
@@ -15,18 +15,6 @@ The custom directory is an injection point for custom user configurations.
The following macros can be defined:
-### Flag related macros:
-
-* `GTEST_FLAG(flag_name)`
-* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its
- own flagfile flag parsing.
-* `GTEST_DECLARE_bool_(name)`
-* `GTEST_DECLARE_int32_(name)`
-* `GTEST_DECLARE_string_(name)`
-* `GTEST_DEFINE_bool_(name, default_val, doc)`
-* `GTEST_DEFINE_int32_(name, default_val, doc)`
-* `GTEST_DEFINE_string_(name, default_val, doc)`
-
### Logging:
* `GTEST_LOG_(severity)`
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h b/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
index db02881c0..9b7fb4261 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
@@ -34,4 +34,35 @@
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+// Use a stub Notification class.
+//
+// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and
+// std::condition_variable. The <mutex> and <condition_variable> headers of
+// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only
+// when configured with the posix threads option but don't define them when
+// configured with the win32 threads option. The Notification class is only
+// used in GoogleTest's internal tests. Since we don't build GoogleTest's
+// internal tests, we don't need a working Notification class. Although it's
+// not hard to fix the mingw32 g++ compilation errors by implementing the
+// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE,
+// it's simpler to just use a stub Notification class on all platforms.
+//
+// The default constructor of the stub class is deleted and the declaration of
+// the Notify() method is commented out, so that compilation will fail if any
+// code actually uses the Notification class.
+
+#define GTEST_HAS_NOTIFICATION_ 1
+namespace testing {
+namespace internal {
+class Notification {
+ public:
+ Notification() = delete;
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
+ // void Notify();
+ void WaitForNotification() {}
+};
+} // namespace internal
+} // namespace testing
+
#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
index 490296dfa..45580ae80 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -26,27 +26,31 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines internal utilities needed for implementing
// death tests. They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#include <stdio.h>
+
+#include <memory>
+
#include "gtest/gtest-matchers.h"
#include "gtest/internal/gtest-internal.h"
-#include <stdio.h>
-#include <memory>
+GTEST_DECLARE_string_(internal_run_death_test);
namespace testing {
namespace internal {
-GTEST_DECLARE_string_(internal_run_death_test);
-
// Names of the flags (needed for parsing Google Test flags).
const char kDeathTestStyleFlag[] = "death_test_style";
const char kDeathTestUseFork[] = "death_test_use_fork";
@@ -83,16 +87,18 @@ class GTEST_API_ DeathTest {
static bool Create(const char* statement, Matcher<const std::string&> matcher,
const char* file, int line, DeathTest** test);
DeathTest();
- virtual ~DeathTest() { }
+ virtual ~DeathTest() {}
// A helper class that aborts a death test when it's deleted.
class ReturnSentinel {
public:
- explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+ explicit ReturnSentinel(DeathTest* test) : test_(test) {}
~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
private:
DeathTest* const test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+ ReturnSentinel(const ReturnSentinel&) = delete;
+ ReturnSentinel& operator=(const ReturnSentinel&) = delete;
} GTEST_ATTRIBUTE_UNUSED_;
// An enumeration of possible roles that may be taken when a death
@@ -137,7 +143,8 @@ class GTEST_API_ DeathTest {
// A string containing a description of the outcome of the last death test.
static std::string last_death_test_message_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+ DeathTest(const DeathTest&) = delete;
+ DeathTest& operator=(const DeathTest&) = delete;
};
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
@@ -145,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// Factory interface for death tests. May be mocked out for testing.
class DeathTestFactory {
public:
- virtual ~DeathTestFactory() { }
+ virtual ~DeathTestFactory() {}
virtual bool Create(const char* statement,
Matcher<const std::string&> matcher, const char* file,
int line, DeathTest** test) = 0;
@@ -186,28 +193,28 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
// Traps C++ exceptions escaping statement and reports them as test
// failures. Note that trapping SEH exceptions is not implemented here.
-# if GTEST_HAS_EXCEPTIONS
-# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } catch (const ::std::exception& gtest_exception) { \
- fprintf(\
- stderr, \
- "\n%s: Caught std::exception-derived exception escaping the " \
- "death test statement. Exception message: %s\n", \
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (const ::std::exception& gtest_exception) { \
+ fprintf( \
+ stderr, \
+ "\n%s: Caught std::exception-derived exception escaping the " \
+ "death test statement. Exception message: %s\n", \
::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
- gtest_exception.what()); \
- fflush(stderr); \
+ gtest_exception.what()); \
+ fflush(stderr); \
death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
- } catch (...) { \
+ } catch (...) { \
death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
}
-# else
-# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
-# endif
+#endif
// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
// ASSERT_EXIT*, and EXPECT_EXIT*.
@@ -236,8 +243,6 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
break; \
} \
- default: \
- break; \
} \
} \
} else \
@@ -265,16 +270,12 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
// RUN_ALL_TESTS was called.
class InternalRunDeathTestFlag {
public:
- InternalRunDeathTestFlag(const std::string& a_file,
- int a_line,
- int an_index,
+ InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index,
int a_write_fd)
- : file_(a_file), line_(a_line), index_(an_index),
- write_fd_(a_write_fd) {}
+ : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
~InternalRunDeathTestFlag() {
- if (write_fd_ >= 0)
- posix::Close(write_fd_);
+ if (write_fd_ >= 0) posix::Close(write_fd_);
}
const std::string& file() const { return file_; }
@@ -288,7 +289,8 @@ class InternalRunDeathTestFlag {
int index_;
int write_fd_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+ InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete;
+ InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete;
};
// Returns a newly created InternalRunDeathTestFlag object with fields
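
[Editorial sketch, not part of the diff] The internal pieces reformatted above (DeathTest, GTEST_EXECUTE_DEATH_TEST_STATEMENT_, InternalRunDeathTestFlag) implement the public death-test assertions. A hedged sketch of the user-facing side, with a hypothetical CrashIfNegative helper:

    #include <cstdlib>

    #include "gtest/gtest.h"

    // Hypothetical function that aborts the process on bad input.
    void CrashIfNegative(int v) {
      if (v < 0) std::abort();
    }

    TEST(DeathAssertionSketch, AbortsOnNegativeInput) {
      // The empty regex matches any stderr output from the dying process.
      EXPECT_DEATH(CrashIfNegative(-1), "");
    }
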
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
index 0c033abc3..a2a60a962 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Google Test filepath utilities
//
// This header file declares classes and functions used internally by
@@ -35,7 +35,9 @@
// This file is #included in gtest/internal/gtest-internal.h.
// Do not include this header file separately!
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
@@ -61,8 +63,8 @@ namespace internal {
class GTEST_API_ FilePath {
public:
- FilePath() : pathname_("") { }
- FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+ FilePath() : pathname_("") {}
+ FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
explicit FilePath(const std::string& pathname) : pathname_(pathname) {
Normalize();
@@ -73,9 +75,7 @@ class GTEST_API_ FilePath {
return *this;
}
- void Set(const FilePath& rhs) {
- pathname_ = rhs.pathname_;
- }
+ void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }
const std::string& string() const { return pathname_; }
const char* c_str() const { return pathname_.c_str(); }
@@ -88,8 +88,7 @@ class GTEST_API_ FilePath {
// than zero (e.g., 12), returns "dir/test_12.xml".
// On Windows platform, uses \ as the separator rather than /.
static FilePath MakeFileName(const FilePath& directory,
- const FilePath& base_name,
- int number,
+ const FilePath& base_name, int number,
const char* extension);
// Given directory = "dir", relative_path = "test.xml",
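
[Editorial sketch, not part of the diff] FilePath::MakeFileName, whose signature is reflowed above, is internal to Google Test; the lines below only restate the behaviour documented in the header comment and are not an endorsed public API.

    #include "gtest/internal/gtest-filepath.h"

    using ::testing::internal::FilePath;

    // Per the comment above: directory "dir", base name "test", number 12 and
    // extension "xml" yield "dir/test_12.xml" (with '\' as separator on Windows).
    const FilePath report =
        FilePath::MakeFileName(FilePath("dir"), FilePath("test"), 12, "xml");
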
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
index f8cbdbd81..9b04e4c85 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
@@ -26,13 +26,15 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file declares functions and macros used internally by
// Google Test. They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
@@ -40,19 +42,20 @@
#include "gtest/internal/gtest-port.h"
#if GTEST_OS_LINUX
-# include <stdlib.h>
-# include <sys/types.h>
-# include <sys/wait.h>
-# include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
#endif // GTEST_OS_LINUX
#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
#endif
#include <ctype.h>
#include <float.h>
#include <string.h>
+
#include <cstdint>
#include <iomanip>
#include <limits>
@@ -76,7 +79,7 @@
// the current line number. For more details, see
// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
// Stringifies its argument.
// Work around a bug in visual studio which doesn't accept code like this:
@@ -98,21 +101,21 @@ namespace testing {
// Forward declarations.
-class AssertionResult; // Result of an assertion.
-class Message; // Represents a failure message.
-class Test; // Represents a test.
-class TestInfo; // Information about a test.
-class TestPartResult; // Result of a test part.
-class UnitTest; // A collection of test suites.
+class AssertionResult; // Result of an assertion.
+class Message; // Represents a failure message.
+class Test; // Represents a test.
+class TestInfo; // Information about a test.
+class TestPartResult; // Result of a test part.
+class UnitTest; // A collection of test suites.
template <typename T>
::std::string PrintToString(const T& value);
namespace internal {
-struct TraceInfo; // Information about a trace point.
-class TestInfoImpl; // Opaque implementation of TestInfo
-class UnitTestImpl; // Opaque implementation of UnitTest
+struct TraceInfo; // Information about a trace point.
+class TestInfoImpl; // Opaque implementation of TestInfo
+class UnitTestImpl; // Opaque implementation of UnitTest
// The text used in failure messages to indicate the start of the
// stack trace.
@@ -121,6 +124,7 @@ GTEST_API_ extern const char kStackTraceMarker[];
// An IgnoredValue object can be implicitly constructed from ANY value.
class IgnoredValue {
struct Sink {};
+
public:
// This constructor template allows any value to be implicitly
// converted to IgnoredValue. The object has no data member and
@@ -136,13 +140,13 @@ class IgnoredValue {
};
// Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(
- const std::string& gtest_msg, const Message& user_msg);
+GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg,
+ const Message& user_msg);
#if GTEST_HAS_EXCEPTIONS
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
-/* an exported class was derived from a class that was not exported */)
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+ 4275 /* an exported class was derived from a class that was not exported */)
// This exception is thrown by (and only by) a failed Google Test
// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
@@ -181,14 +185,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
} // namespace edit_distance
-// Calculate the diff between 'left' and 'right' and return it in unified diff
-// format.
-// If not null, stores in 'total_line_count' the total number of lines found
-// in left + right.
-GTEST_API_ std::string DiffStrings(const std::string& left,
- const std::string& right,
- size_t* total_line_count);
-
// Constructs and returns the message for an equality assertion
// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
//
@@ -212,10 +208,8 @@ GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
GTEST_API_ std::string GetBoolAssertionFailureMessage(
- const AssertionResult& assertion_result,
- const char* expression_text,
- const char* actual_predicate_value,
- const char* expected_predicate_value);
+ const AssertionResult& assertion_result, const char* expression_text,
+ const char* actual_predicate_value, const char* expected_predicate_value);
// This template class represents an IEEE floating-point number
// (either single-precision or double-precision, depending on the
@@ -256,11 +250,11 @@ class FloatingPoint {
// Constants.
// # of bits in a number.
- static const size_t kBitCount = 8*sizeof(RawType);
+ static const size_t kBitCount = 8 * sizeof(RawType);
// # of fraction bits in a number.
static const size_t kFractionBitCount =
- std::numeric_limits<RawType>::digits - 1;
+ std::numeric_limits<RawType>::digits - 1;
// # of exponent bits in a number.
static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
@@ -269,8 +263,8 @@ class FloatingPoint {
static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
// The mask for the fraction bits.
- static const Bits kFractionBitMask =
- ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+ static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
+ (kExponentBitCount + 1);
// The mask for the exponent bits.
static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
@@ -309,9 +303,7 @@ class FloatingPoint {
}
// Returns the floating-point number that represent positive infinity.
- static RawType Infinity() {
- return ReinterpretBits(kExponentBitMask);
- }
+ static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
// Returns the maximum representable finite floating-point number.
static RawType Max();
@@ -319,7 +311,7 @@ class FloatingPoint {
// Non-static methods
// Returns the bits that represents this number.
- const Bits &bits() const { return u_.bits_; }
+ const Bits& bits() const { return u_.bits_; }
// Returns the exponent bits of this number.
Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
@@ -348,8 +340,8 @@ class FloatingPoint {
// a NAN must return false.
if (is_nan() || rhs.is_nan()) return false;
- return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
- <= kMaxUlps;
+ return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
+ kMaxUlps;
}
private:
@@ -374,7 +366,7 @@ class FloatingPoint {
//
// Read http://en.wikipedia.org/wiki/Signed_number_representations
// for more details on signed number representations.
- static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+ static Bits SignAndMagnitudeToBiased(const Bits& sam) {
if (kSignBitMask & sam) {
// sam represents a negative number.
return ~sam + 1;
@@ -386,8 +378,8 @@ class FloatingPoint {
// Given two numbers in the sign-and-magnitude representation,
// returns the distance between them as an unsigned number.
- static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
- const Bits &sam2) {
+ static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1,
+ const Bits& sam2) {
const Bits biased1 = SignAndMagnitudeToBiased(sam1);
const Bits biased2 = SignAndMagnitudeToBiased(sam2);
return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
@@ -399,9 +391,13 @@ class FloatingPoint {
// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
// macro defined by <windows.h>.
template <>
-inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+inline float FloatingPoint<float>::Max() {
+ return FLT_MAX;
+}
template <>
-inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+inline double FloatingPoint<double>::Max() {
+ return DBL_MAX;
+}
// Typedefs the instances of the FloatingPoint template class that we
// care to use.
@@ -461,7 +457,8 @@ class TestFactoryBase {
TestFactoryBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+ TestFactoryBase(const TestFactoryBase&) = delete;
+ TestFactoryBase& operator=(const TestFactoryBase&) = delete;
};
// This class provides implementation of TeastFactoryBase interface.
@@ -510,11 +507,11 @@ inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
template <typename T>
// Note that SuiteApiResolver inherits from T because
-// SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way
+// SetUpTestSuite()/TearDownTestSuite() could be protected. This way
// SuiteApiResolver can access them.
struct SuiteApiResolver : T {
// testing::Test is only forward declared at this point. So we make it a
- // dependend class for the compiler to be OK with it.
+ // dependent class for the compiler to be OK with it.
using Test =
typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
@@ -654,7 +651,8 @@ inline const char* SkipComma(const char* str) {
if (comma == nullptr) {
return nullptr;
}
- while (IsSpace(*(++comma))) {}
+ while (IsSpace(*(++comma))) {
+ }
return comma;
}
@@ -668,7 +666,7 @@ inline std::string GetPrefixUntilComma(const char* str) {
// Splits a given string on a given delimiter, populating a given
// vector with the fields.
void SplitString(const ::std::string& str, char delimiter,
- ::std::vector< ::std::string>* dest);
+ ::std::vector<::std::string>* dest);
// The default argument to the template below for the case when the user does
// not provide a name generator.
@@ -781,13 +779,13 @@ class TypeParameterizedTestSuite {
const std::vector<std::string>& type_names =
GenerateNames<DefaultNameGenerator, Types>()) {
RegisterTypeParameterizedTestSuiteInstantiation(case_name);
- std::string test_name = StripTrailingSpaces(
- GetPrefixUntilComma(test_names));
+ std::string test_name =
+ StripTrailingSpaces(GetPrefixUntilComma(test_names));
if (!state->TestExists(test_name)) {
fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
case_name, test_name.c_str(),
- FormatFileLocation(code_location.file.c_str(),
- code_location.line).c_str());
+ FormatFileLocation(code_location.file.c_str(), code_location.line)
+ .c_str());
fflush(stderr);
posix::Abort();
}
@@ -831,8 +829,8 @@ class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
// For example, if Foo() calls Bar(), which in turn calls
// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
- UnitTest* unit_test, int skip_count);
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test,
+ int skip_count);
// Helpers for suppressing warnings on unreachable code or constant
// condition.
@@ -881,7 +879,8 @@ class GTEST_API_ Random {
private:
uint32_t state_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+ Random(const Random&) = delete;
+ Random& operator=(const Random&) = delete;
};
// Turns const U&, U&, const U, and U all into U.
@@ -954,7 +953,9 @@ IsContainer IsContainerTest(int /* dummy */) {
typedef char IsNotContainer;
template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+IsNotContainer IsContainerTest(long /* dummy */) {
+ return '\0';
+}
// Trait to detect whether a type T is a hash table.
// The heuristic used is that the type contains an inner type `hasher` and does
@@ -1017,11 +1018,13 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs);
// This generic version is used when k is 0.
template <typename T, typename U>
-inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+inline bool ArrayEq(const T& lhs, const U& rhs) {
+ return lhs == rhs;
+}
// This overload is used when k >= 1.
template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
return internal::ArrayEq(lhs, N, rhs);
}
@@ -1031,8 +1034,7 @@ inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
template <typename T, typename U>
bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
for (size_t i = 0; i != size; i++) {
- if (!internal::ArrayEq(lhs[i], rhs[i]))
- return false;
+ if (!internal::ArrayEq(lhs[i], rhs[i])) return false;
}
return true;
}
@@ -1042,8 +1044,7 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
template <typename Iter, typename Element>
Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
for (Iter it = begin; it != end; ++it) {
- if (internal::ArrayEq(*it, elem))
- return it;
+ if (internal::ArrayEq(*it, elem)) return it;
}
return end;
}
@@ -1057,11 +1058,13 @@ void CopyArray(const T* from, size_t size, U* to);
// This generic version is used when k is 0.
template <typename T, typename U>
-inline void CopyArray(const T& from, U* to) { *to = from; }
+inline void CopyArray(const T& from, U* to) {
+ *to = from;
+}
// This overload is used when k >= 1.
template <typename T, typename U, size_t N>
-inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+inline void CopyArray(const T (&from)[N], U (*to)[N]) {
internal::CopyArray(from, N, *to);
}
@@ -1114,8 +1117,7 @@ class NativeArray {
}
~NativeArray() {
- if (clone_ != &NativeArray::InitRef)
- delete[] array_;
+ if (clone_ != &NativeArray::InitRef) delete[] array_;
}
// STL-style container methods.
@@ -1123,8 +1125,7 @@ class NativeArray {
const_iterator begin() const { return array_; }
const_iterator end() const { return array_ + size_; }
bool operator==(const NativeArray& rhs) const {
- return size() == rhs.size() &&
- ArrayEq(begin(), size(), rhs.begin());
+ return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
}
private:
@@ -1335,9 +1336,9 @@ struct tuple_size<testing::internal::FlatTuple<Ts...>>
#endif
} // namespace std
-#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
- ::testing::internal::AssertHelper(result_type, file, line, message) \
- = ::testing::Message()
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+ ::testing::internal::AssertHelper(result_type, file, line, message) = \
+ ::testing::Message()
#define GTEST_MESSAGE_(message, result_type) \
GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
@@ -1458,103 +1459,112 @@ class NeverThrown {
#endif // GTEST_HAS_EXCEPTIONS
-#define GTEST_TEST_NO_THROW_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::TrueWithString gtest_msg{}) { \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } \
- GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
- catch (...) { \
- gtest_msg.value = "it throws."; \
- goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
- fail(("Expected: " #statement " doesn't throw an exception.\n" \
- " Actual: " + gtest_msg.value).c_str())
-
-#define GTEST_TEST_ANY_THROW_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
- bool gtest_caught_any = false; \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } \
- catch (...) { \
- gtest_caught_any = true; \
- } \
- if (!gtest_caught_any) { \
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::TrueWithString gtest_msg{}) { \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } \
+ GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
+ catch (...) { \
+ gtest_msg.value = "it throws."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \
+ : fail(("Expected: " #statement " doesn't throw an exception.\n" \
+ " Actual: " + \
+ gtest_msg.value) \
+ .c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ bool gtest_caught_any = false; \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (...) { \
+ gtest_caught_any = true; \
+ } \
+ if (!gtest_caught_any) { \
goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
- fail("Expected: " #statement " throws an exception.\n" \
- " Actual: it doesn't.")
-
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \
+ : fail("Expected: " #statement \
+ " throws an exception.\n" \
+ " Actual: it doesn't.")
// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
// either a boolean expression or an AssertionResult. text is a textual
// representation of expression as it was passed into the EXPECT_TRUE.
#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (const ::testing::AssertionResult gtest_ar_ = \
- ::testing::AssertionResult(expression)) \
- ; \
- else \
- fail(::testing::internal::GetBoolAssertionFailureMessage(\
- gtest_ar_, text, #actual, #expected).c_str())
-
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (const ::testing::AssertionResult gtest_ar_ = \
+ ::testing::AssertionResult(expression)) \
+ ; \
+ else \
+ fail(::testing::internal::GetBoolAssertionFailureMessage( \
+ gtest_ar_, text, #actual, #expected) \
+ .c_str())
+
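// Editor's sketch, not part of this patch: GTEST_TEST_BOOLEAN_ above is what
// EXPECT_TRUE/EXPECT_FALSE expand to. Because the expression is first converted
// to an AssertionResult, a predicate can carry its own failure message:

#include "gtest/gtest.h"

::testing::AssertionResult IsEven(int n) {
  if (n % 2 == 0) return ::testing::AssertionSuccess();
  return ::testing::AssertionFailure() << n << " is odd";
}

TEST(BooleanAssertionSketch, UsesAssertionResult) {
  EXPECT_TRUE(IsEven(4));  // passes
  EXPECT_TRUE(IsEven(5));  // fails; the macro above prints "5 is odd"
}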
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
- goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
- fail("Expected: " #statement " doesn't generate new fatal " \
- "failures in the current thread.\n" \
- " Actual: it does.")
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \
+ : fail("Expected: " #statement \
+ " doesn't generate new fatal " \
+ "failures in the current thread.\n" \
+ " Actual: it does.")
// Expands to the name of the class that implements the given test.
#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
test_suite_name##_##test_name##_Test
// Helper macro for defining tests.
-#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \
- static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \
- "test_suite_name must not be empty"); \
- static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \
- "test_name must not be empty"); \
- class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
- : public parent_class { \
- public: \
- GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \
- ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
- GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
- \
- private: \
- void TestBody() override; \
- static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \
- }; \
- \
- ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)::test_info_ = \
- ::testing::internal::MakeAndRegisterTestInfo( \
- #test_suite_name, #test_name, nullptr, nullptr, \
- ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
- ::testing::internal::SuiteApiResolver< \
- parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \
- ::testing::internal::SuiteApiResolver< \
- parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \
- new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \
- test_suite_name, test_name)>); \
+#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \
+ "test_suite_name must not be empty"); \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \
+ "test_name must not be empty"); \
+ class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ : public parent_class { \
+ public: \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \
+ ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &&) noexcept = delete; /* NOLINT */ \
+ \
+ private: \
+ void TestBody() override; \
+ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \
+ }; \
+ \
+ ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name)::test_info_ = \
+ ::testing::internal::MakeAndRegisterTestInfo( \
+ #test_suite_name, #test_name, nullptr, nullptr, \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \
+ new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \
+ test_suite_name, test_name)>); \
void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
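// Editor's sketch, not part of this patch: TEST(Suite, Name) is a thin wrapper over
// GTEST_TEST_ above, so after this change the generated fixture class declares its
// deleted copy/move members inline instead of via the removed GTEST_DISALLOW_*
// macros. The effect is observable from the test body:

#include <type_traits>
#include "gtest/gtest.h"

TEST(MacroExpansionSketch, GeneratedClassIsNotCopyable) {
  // GTEST_TEST_CLASS_NAME_(MacroExpansionSketch, GeneratedClassIsNotCopyable)
  using Self = MacroExpansionSketch_GeneratedClassIsNotCopyable_Test;
  static_assert(!std::is_copy_constructible<Self>::value,
                "the generated fixture deletes its copy constructor");
  static_assert(!std::is_move_assignable<Self>::value,
                "the generated fixture deletes its move assignment");
  EXPECT_EQ(2, 1 + 1);
}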
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
index c2ef6e312..e7af2f904 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
@@ -27,10 +27,11 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Type and function utilities for implementing parameterized tests.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
@@ -46,19 +47,18 @@
#include <utility>
#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
#include "gtest/gtest-printers.h"
#include "gtest/gtest-test-part.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
namespace testing {
// Input to a parameterized test name generator, describing a test parameter.
// Consists of the parameter value and the integer parameter index.
template <class ParamType>
struct TestParamInfo {
- TestParamInfo(const ParamType& a_param, size_t an_index) :
- param(a_param),
- index(an_index) {}
+ TestParamInfo(const ParamType& a_param, size_t an_index)
+ : param(a_param), index(an_index) {}
ParamType param;
size_t index;
};
@@ -84,8 +84,10 @@ namespace internal {
GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
CodeLocation code_location);
-template <typename> class ParamGeneratorInterface;
-template <typename> class ParamGenerator;
+template <typename>
+class ParamGeneratorInterface;
+template <typename>
+class ParamGenerator;
// Interface for iterating over elements provided by an implementation
// of ParamGeneratorInterface<T>.
@@ -129,8 +131,7 @@ class ParamIterator {
// ParamIterator assumes ownership of the impl_ pointer.
ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
ParamIterator& operator=(const ParamIterator& other) {
- if (this != &other)
- impl_.reset(other.impl_->Clone());
+ if (this != &other) impl_.reset(other.impl_->Clone());
return *this;
}
@@ -157,7 +158,7 @@ class ParamIterator {
private:
friend class ParamGenerator<T>;
explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
- std::unique_ptr<ParamIteratorInterface<T> > impl_;
+ std::unique_ptr<ParamIteratorInterface<T>> impl_;
};
// ParamGeneratorInterface<T> is the binary interface to access generators
@@ -179,7 +180,7 @@ class ParamGeneratorInterface {
// This class implements copy initialization semantics and the contained
// ParamGeneratorInterface<T> instance is shared among all copies
// of the original object. This is possible because that instance is immutable.
-template<typename T>
+template <typename T>
class ParamGenerator {
public:
typedef ParamIterator<T> iterator;
@@ -196,7 +197,7 @@ class ParamGenerator {
iterator end() const { return iterator(impl_->End()); }
private:
- std::shared_ptr<const ParamGeneratorInterface<T> > impl_;
+ std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
};
// Generates values from a range of two comparable values. Can be used to
@@ -207,8 +208,10 @@ template <typename T, typename IncrementT>
class RangeGenerator : public ParamGeneratorInterface<T> {
public:
RangeGenerator(T begin, T end, IncrementT step)
- : begin_(begin), end_(end),
- step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+ : begin_(begin),
+ end_(end),
+ step_(step),
+ end_index_(CalculateEndIndex(begin, end, step)) {}
~RangeGenerator() override {}
ParamIteratorInterface<T>* Begin() const override {
@@ -251,7 +254,9 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
private:
Iterator(const Iterator& other)
: ParamIteratorInterface<T>(),
- base_(other.base_), value_(other.value_), index_(other.index_),
+ base_(other.base_),
+ value_(other.value_),
+ index_(other.index_),
step_(other.step_) {}
// No implementation - assignment is unsupported.
@@ -263,12 +268,10 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
const IncrementT step_;
}; // class RangeGenerator::Iterator
- static int CalculateEndIndex(const T& begin,
- const T& end,
+ static int CalculateEndIndex(const T& begin, const T& end,
const IncrementT& step) {
int end_index = 0;
- for (T i = begin; i < end; i = static_cast<T>(i + step))
- end_index++;
+ for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++;
return end_index;
}
@@ -283,7 +286,6 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
const int end_index_;
}; // class RangeGenerator
-
// Generates values from a pair of STL-style iterators. Used in the
// ValuesIn() function. The elements are copied from the source range
// since the source can be located on the stack, and the generator
@@ -341,13 +343,13 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
<< "The program attempted to compare iterators "
<< "from different generators." << std::endl;
return iterator_ ==
- CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+ CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
}
private:
Iterator(const Iterator& other)
- // The explicit constructor call suppresses a false warning
- // emitted by gcc when supplied with the -Wextra option.
+ // The explicit constructor call suppresses a false warning
+ // emitted by gcc when supplied with the -Wextra option.
: ParamIteratorInterface<T>(),
base_(other.base_),
iterator_(other.iterator_) {}
@@ -394,8 +396,8 @@ template <class TestClass>
class ParameterizedTestFactory : public TestFactoryBase {
public:
typedef typename TestClass::ParamType ParamType;
- explicit ParameterizedTestFactory(ParamType parameter) :
- parameter_(parameter) {}
+ explicit ParameterizedTestFactory(ParamType parameter)
+ : parameter_(parameter) {}
Test* CreateTest() override {
TestClass::SetParam(&parameter_);
return new TestClass();
@@ -404,7 +406,8 @@ class ParameterizedTestFactory : public TestFactoryBase {
private:
const ParamType parameter_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+ ParameterizedTestFactory(const ParameterizedTestFactory&) = delete;
+ ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -440,7 +443,8 @@ class TestMetaFactory
}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+ TestMetaFactory(const TestMetaFactory&) = delete;
+ TestMetaFactory& operator=(const TestMetaFactory&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -471,7 +475,10 @@ class ParameterizedTestSuiteInfoBase {
ParameterizedTestSuiteInfoBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
+ ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) =
+ delete;
+ ParameterizedTestSuiteInfoBase& operator=(
+ const ParameterizedTestSuiteInfoBase&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -547,8 +554,8 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
test_it != tests_.end(); ++test_it) {
std::shared_ptr<TestInfo> test_info = *test_it;
for (typename InstantiationContainer::iterator gen_it =
- instantiations_.begin(); gen_it != instantiations_.end();
- ++gen_it) {
+ instantiations_.begin();
+ gen_it != instantiations_.end(); ++gen_it) {
const std::string& instantiation_name = gen_it->name;
ParamGenerator<ParamType> generator((*gen_it->generator)());
ParamNameGeneratorFunc* name_func = gen_it->name_func;
@@ -556,7 +563,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
int line = gen_it->line;
std::string test_suite_name;
- if ( !instantiation_name.empty() )
+ if (!instantiation_name.empty())
test_suite_name = instantiation_name + "/";
test_suite_name += test_info->test_suite_base_name;
@@ -569,17 +576,16 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
Message test_name_stream;
- std::string param_name = name_func(
- TestParamInfo<ParamType>(*param_it, i));
+ std::string param_name =
+ name_func(TestParamInfo<ParamType>(*param_it, i));
GTEST_CHECK_(IsValidParamName(param_name))
<< "Parameterized test name '" << param_name
- << "' is invalid, in " << file
- << " line " << line << std::endl;
+ << "' is invalid, in " << file << " line " << line << std::endl;
GTEST_CHECK_(test_param_names.count(param_name) == 0)
- << "Duplicate parameterized test name '" << param_name
- << "', in " << file << " line " << line << std::endl;
+ << "Duplicate parameterized test name '" << param_name << "', in "
+ << file << " line " << line << std::endl;
test_param_names.insert(param_name);
@@ -596,15 +602,15 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
test_info->test_meta_factory->CreateTestFactory(*param_it));
} // for param_it
- } // for gen_it
- } // for test_it
+ } // for gen_it
+ } // for test_it
if (!generated_instantiations) {
      // There are no generators, or they all generate nothing ...

InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
!tests_.empty());
}
- } // RegisterTests
+ } // RegisterTests
private:
// LocalTestInfo structure keeps information about a single test registered
@@ -620,42 +626,39 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
const std::string test_suite_base_name;
const std::string test_base_name;
- const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+ const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
const CodeLocation code_location;
};
- using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >;
+ using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
// Records data received from INSTANTIATE_TEST_SUITE_P macros:
// <Instantiation name, Sequence generator creation function,
// Name generator function, Source file, Source line>
struct InstantiationInfo {
- InstantiationInfo(const std::string &name_in,
- GeneratorCreationFunc* generator_in,
- ParamNameGeneratorFunc* name_func_in,
- const char* file_in,
- int line_in)
- : name(name_in),
- generator(generator_in),
- name_func(name_func_in),
- file(file_in),
- line(line_in) {}
-
- std::string name;
- GeneratorCreationFunc* generator;
- ParamNameGeneratorFunc* name_func;
- const char* file;
- int line;
+ InstantiationInfo(const std::string& name_in,
+ GeneratorCreationFunc* generator_in,
+ ParamNameGeneratorFunc* name_func_in, const char* file_in,
+ int line_in)
+ : name(name_in),
+ generator(generator_in),
+ name_func(name_func_in),
+ file(file_in),
+ line(line_in) {}
+
+ std::string name;
+ GeneratorCreationFunc* generator;
+ ParamNameGeneratorFunc* name_func;
+ const char* file;
+ int line;
};
typedef ::std::vector<InstantiationInfo> InstantiationContainer;
static bool IsValidParamName(const std::string& name) {
// Check for empty string
- if (name.empty())
- return false;
+ if (name.empty()) return false;
// Check for invalid characters
for (std::string::size_type index = 0; index < name.size(); ++index) {
- if (!IsAlNum(name[index]) && name[index] != '_')
- return false;
+ if (!IsAlNum(name[index]) && name[index] != '_') return false;
}
return true;
@@ -666,7 +669,9 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
TestInfoContainer tests_;
InstantiationContainer instantiations_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo);
+ ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete;
+ ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) =
+ delete;
}; // class ParameterizedTestSuiteInfo
// Legacy API is deprecated but still available
@@ -709,7 +714,7 @@ class ParameterizedTestSuiteRegistry {
// type we are looking for, so we downcast it to that type
// without further checks.
typed_test_info = CheckedDowncastToActualType<
- ParameterizedTestSuiteInfo<TestSuite> >(test_suite_info);
+ ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
}
break;
}
@@ -741,7 +746,10 @@ class ParameterizedTestSuiteRegistry {
TestSuiteInfoContainer test_suite_infos_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
+ ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) =
+ delete;
+ ParameterizedTestSuiteRegistry& operator=(
+ const ParameterizedTestSuiteRegistry&) = delete;
};
// Keep track of what type-parameterized test suite are defined and
@@ -836,7 +844,8 @@ class CartesianProductGenerator
: public ParamIteratorInterface<ParamType> {
public:
IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
- const std::tuple<ParamGenerator<T>...>& generators, bool is_end)
+ const std::tuple<ParamGenerator<T>...>& generators,
+ bool is_end)
: base_(base),
begin_(std::get<I>(generators).begin()...),
end_(std::get<I>(generators).end()...),
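// Editor's sketch, not part of this patch: the utilities in this header back the
// public value-parameterized test API. The IsValidParamName() check above is why a
// custom name generator must emit only alphanumerics and underscores:

#include <string>
#include "gtest/gtest.h"

class WidthTest : public ::testing::TestWithParam<int> {};

TEST_P(WidthTest, IsPositive) { EXPECT_GT(GetParam(), 0); }

INSTANTIATE_TEST_SUITE_P(
    Sizes, WidthTest, ::testing::Values(8, 16, 32),
    [](const ::testing::TestParamInfo<int>& info) {
      // Must satisfy IsValidParamName(): alphanumeric/underscore only.
      return "w" + std::to_string(info.param);
    });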
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
index dd845915e..f025db76a 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the GTEST_OS_* macro.
@@ -37,70 +37,72 @@
// Determines the platform on which Google Test is compiled.
#ifdef __CYGWIN__
-# define GTEST_OS_CYGWIN 1
-# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
-# define GTEST_OS_WINDOWS_MINGW 1
-# define GTEST_OS_WINDOWS 1
+#define GTEST_OS_CYGWIN 1
+#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#define GTEST_OS_WINDOWS_MINGW 1
+#define GTEST_OS_WINDOWS 1
#elif defined _WIN32
-# define GTEST_OS_WINDOWS 1
-# ifdef _WIN32_WCE
-# define GTEST_OS_WINDOWS_MOBILE 1
-# elif defined(WINAPI_FAMILY)
-# include <winapifamily.h>
-# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
-# define GTEST_OS_WINDOWS_PHONE 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
-# define GTEST_OS_WINDOWS_RT 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
-# define GTEST_OS_WINDOWS_PHONE 1
-# define GTEST_OS_WINDOWS_TV_TITLE 1
-# else
- // WINAPI_FAMILY defined but no known partition matched.
- // Default to desktop.
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# endif
-# else
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# endif // _WIN32_WCE
+#define GTEST_OS_WINDOWS 1
+#ifdef _WIN32_WCE
+#define GTEST_OS_WINDOWS_MOBILE 1
+#elif defined(WINAPI_FAMILY)
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#define GTEST_OS_WINDOWS_PHONE 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#define GTEST_OS_WINDOWS_RT 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#define GTEST_OS_WINDOWS_PHONE 1
+#define GTEST_OS_WINDOWS_TV_TITLE 1
+#else
+// WINAPI_FAMILY defined but no known partition matched.
+// Default to desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif
+#else
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif // _WIN32_WCE
#elif defined __OS2__
-# define GTEST_OS_OS2 1
+#define GTEST_OS_OS2 1
#elif defined __APPLE__
-# define GTEST_OS_MAC 1
-# include <TargetConditionals.h>
-# if TARGET_OS_IPHONE
-# define GTEST_OS_IOS 1
-# endif
+#define GTEST_OS_MAC 1
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#define GTEST_OS_IOS 1
+#endif
#elif defined __DragonFly__
-# define GTEST_OS_DRAGONFLY 1
+#define GTEST_OS_DRAGONFLY 1
#elif defined __FreeBSD__
-# define GTEST_OS_FREEBSD 1
+#define GTEST_OS_FREEBSD 1
#elif defined __Fuchsia__
-# define GTEST_OS_FUCHSIA 1
+#define GTEST_OS_FUCHSIA 1
+#elif defined(__GNU__)
+#define GTEST_OS_GNU_HURD 1
#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
-# define GTEST_OS_GNU_KFREEBSD 1
+#define GTEST_OS_GNU_KFREEBSD 1
#elif defined __linux__
-# define GTEST_OS_LINUX 1
-# if defined __ANDROID__
-# define GTEST_OS_LINUX_ANDROID 1
-# endif
+#define GTEST_OS_LINUX 1
+#if defined __ANDROID__
+#define GTEST_OS_LINUX_ANDROID 1
+#endif
#elif defined __MVS__
-# define GTEST_OS_ZOS 1
+#define GTEST_OS_ZOS 1
#elif defined(__sun) && defined(__SVR4)
-# define GTEST_OS_SOLARIS 1
+#define GTEST_OS_SOLARIS 1
#elif defined(_AIX)
-# define GTEST_OS_AIX 1
+#define GTEST_OS_AIX 1
#elif defined(__hpux)
-# define GTEST_OS_HPUX 1
+#define GTEST_OS_HPUX 1
#elif defined __native_client__
-# define GTEST_OS_NACL 1
+#define GTEST_OS_NACL 1
#elif defined __NetBSD__
-# define GTEST_OS_NETBSD 1
+#define GTEST_OS_NETBSD 1
#elif defined __OpenBSD__
-# define GTEST_OS_OPENBSD 1
+#define GTEST_OS_OPENBSD 1
#elif defined __QNX__
-# define GTEST_OS_QNX 1
+#define GTEST_OS_QNX 1
#elif defined(__HAIKU__)
#define GTEST_OS_HAIKU 1
#elif defined ESP8266
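// Editor's sketch, not part of this patch: the GTEST_OS_* macros defined above
// (including the newly added GTEST_OS_GNU_HURD) are intended for conditional
// compilation in test code, for example to skip a platform-specific check:

#include "gtest/gtest.h"

TEST(PlatformSketch, LinuxOnlyBehaviour) {
#if !GTEST_OS_LINUX
  GTEST_SKIP() << "only meaningful on Linux";
#endif
  EXPECT_TRUE(true);  // real Linux-specific checks would go here
}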
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
index 0953a781c..0003d2765 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Low-level types and utilities for porting Google Test to various
// platforms. All macros ending with _ and symbols defined in an
// internal namespace are subject to change without notice. Code
@@ -38,7 +38,9 @@
// files are expected to #include this. Therefore, it cannot #include
// any other Google Test header.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
@@ -116,6 +118,7 @@
// GTEST_OS_DRAGONFLY - DragonFlyBSD
// GTEST_OS_FREEBSD - FreeBSD
// GTEST_OS_FUCHSIA - Fuchsia
+// GTEST_OS_GNU_HURD - GNU/Hurd
// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
// GTEST_OS_HAIKU - Haiku
// GTEST_OS_HPUX - HP-UX
@@ -167,7 +170,7 @@
// GTEST_HAS_TYPED_TEST - typed tests
// GTEST_HAS_TYPED_TEST_P - type-parameterized tests
// GTEST_IS_THREADSAFE - Google Test is thread-safe.
-// GOOGLETEST_CM0007 DO NOT DELETE
+// GTEST_USES_RE2 - the RE2 regular expression library is used
// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with
// GTEST_HAS_POSIX_RE (see above) which users can
// define themselves.
@@ -190,10 +193,6 @@
// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a
// variable don't have to be used.
-// GTEST_DISALLOW_ASSIGN_ - disables copy operator=.
-// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
-// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=.
-// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=.
// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used.
// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
// suppressed (constant conditional).
@@ -217,11 +216,13 @@
// - synchronization primitives.
//
// Regular expressions:
-// RE - a simple regular expression class using the POSIX
-// Extended Regular Expression syntax on UNIX-like platforms
-// GOOGLETEST_CM0008 DO NOT DELETE
-// or a reduced regular exception syntax on other
-// platforms, including Windows.
+// RE - a simple regular expression class using
+// 1) the RE2 syntax on all platforms when built with RE2
+// and Abseil as dependencies
+// 2) the POSIX Extended Regular Expression syntax on
+// UNIX-like platforms,
+// 3) a reduced regular expression syntax on other platforms,
+// including Windows.
// Logging:
// GTEST_LOG_() - logs messages at the specified severity level.
// LogToStderr() - directs all log messages to stderr.
@@ -241,8 +242,6 @@
// BiggestInt - the biggest signed integer type.
//
// Command-line utilities:
-// GTEST_DECLARE_*() - declares a flag.
-// GTEST_DEFINE_*() - defines a flag.
// GetInjectableArgvs() - returns the command line as a vector of strings.
//
// Environment variable utilities:
@@ -263,48 +262,55 @@
#include <string.h>
#include <cerrno>
+// #include <condition_variable> // Guarded by GTEST_IS_THREADSAFE below
#include <cstdint>
+#include <iostream>
#include <limits>
+#include <locale>
+#include <memory>
+#include <string>
+// #include <mutex> // Guarded by GTEST_IS_THREADSAFE below
+#include <tuple>
#include <type_traits>
+#include <vector>
#ifndef _WIN32_WCE
-# include <sys/types.h>
-# include <sys/stat.h>
+#include <sys/stat.h>
+#include <sys/types.h>
#endif // !_WIN32_WCE
#if defined __APPLE__
-# include <AvailabilityMacros.h>
-# include <TargetConditionals.h>
+#include <AvailabilityMacros.h>
+#include <TargetConditionals.h>
#endif
-#include <iostream> // NOLINT
-#include <locale>
-#include <memory>
-#include <string> // NOLINT
-#include <tuple>
-#include <vector> // NOLINT
-
#include "gtest/internal/custom/gtest-port.h"
#include "gtest/internal/gtest-port-arch.h"
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#include "absl/flags/reflection.h"
+#endif
+
#if !defined(GTEST_DEV_EMAIL_)
-# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-# define GTEST_FLAG_PREFIX_ "gtest_"
-# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-# define GTEST_NAME_ "Google Test"
-# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
#endif // !defined(GTEST_DEV_EMAIL_)
#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
// Determines the version of gcc that is used to compile this.
#ifdef __GNUC__
// 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \
- (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#define GTEST_GCC_VER_ \
+ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif // __GNUC__
// Macros for disabling Microsoft Visual C++ warnings.
@@ -313,41 +319,37 @@
// /* code that triggers warnings C4800 and C4385 */
// GTEST_DISABLE_MSC_WARNINGS_POP_()
#if defined(_MSC_VER)
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
- __pragma(warning(push)) \
- __pragma(warning(disable: warnings))
-# define GTEST_DISABLE_MSC_WARNINGS_POP_() \
- __pragma(warning(pop))
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+ __pragma(warning(push)) __pragma(warning(disable : warnings))
+#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
#else
// Not all compilers are MSVC
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+#define GTEST_DISABLE_MSC_WARNINGS_POP_()
#endif
// Clang on Windows does not understand MSVC's pragma warning.
// We need clang-specific way to disable function deprecation warning.
#ifdef __clang__
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
- _Pragma("clang diagnostic push") \
- _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
- _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
-#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
- _Pragma("clang diagnostic pop")
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ _Pragma("clang diagnostic push") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
#else
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
-# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
- GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
#endif
// Brings in definitions for functions used in the testing::internal::posix
// namespace (read, write, close, chdir, isatty, stat). We do not currently
// use them on Windows Mobile.
#if GTEST_OS_WINDOWS
-# if !GTEST_OS_WINDOWS_MOBILE
-# include <direct.h>
-# include <io.h>
-# endif
+#if !GTEST_OS_WINDOWS_MOBILE
+#include <direct.h>
+#include <io.h>
+#endif
// In order to avoid having to include <windows.h>, use forward declaration
#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
@@ -367,68 +369,55 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// This assumes that non-Windows OSes provide unistd.h. For OSes where this
// is not the case, we need to include headers that provide the functions
// mentioned above.
-# include <unistd.h>
-# include <strings.h>
+#include <strings.h>
+#include <unistd.h>
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_LINUX_ANDROID
// Used to define __ANDROID_API__ matching the target NDK API level.
-# include <android/api-level.h> // NOLINT
+#include <android/api-level.h> // NOLINT
#endif
// Defines this to true if and only if Google Test can use POSIX regular
// expressions.
#ifndef GTEST_HAS_POSIX_RE
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
// On Android, <regex.h> is only available starting with Gingerbread.
-# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-# else
+#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+#else
#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
-# endif
+#endif
#endif
-#if GTEST_USES_PCRE
-// The appropriate headers have already been included.
-
+// Select the regular expression implementation.
+#if GTEST_HAS_ABSL
+// When using Abseil, RE2 is required.
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+#define GTEST_USES_RE2 1
#elif GTEST_HAS_POSIX_RE
-
-// On some platforms, <regex.h> needs someone to define size_t, and
-// won't compile otherwise. We can #include it here as we already
-// included <stdlib.h>, which is guaranteed to define size_t through
-// <stddef.h>.
-# include <regex.h> // NOLINT
-
-# define GTEST_USES_POSIX_RE 1
-
-#elif GTEST_OS_WINDOWS
-
-// <regex.h> is not available on Windows. Use our own simple regex
-// implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
+#include <regex.h> // NOLINT
+#define GTEST_USES_POSIX_RE 1
#else
-
-// <regex.h> may not be available on this platform. Use our own
-// simple regex implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#endif // GTEST_USES_PCRE
+// Use our own simple regex implementation.
+#define GTEST_USES_SIMPLE_RE 1
+#endif
#ifndef GTEST_HAS_EXCEPTIONS
// The user didn't tell us whether exceptions are enabled, so we need
// to figure it out.
-# if defined(_MSC_VER) && defined(_CPPUNWIND)
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
// macro to enable exceptions, so we'll do the same.
// Assumes that exceptions are enabled by default.
-# ifndef _HAS_EXCEPTIONS
-# define _HAS_EXCEPTIONS 1
-# endif // _HAS_EXCEPTIONS
-# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-# elif defined(__clang__)
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
// there can be cleanups for ObjC exceptions which also need cleanups, even if
@@ -437,27 +426,27 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// cleanups prior to that. To reliably check for C++ exception availability with
// clang, check for
// __EXCEPTIONS && __has_feature(cxx_exceptions).
-# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
-# elif defined(__GNUC__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__SUNPRO_CC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
// Sun Pro CC supports exceptions. However, there is no compile-time way of
// detecting whether they are enabled or not. Therefore, we assume that
// they are enabled unless the user tells us otherwise.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__IBMCPP__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__HP_aCC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
// Exception handling is in effect by default in HP aCC compiler. It has to
// be turned off by the +noeh compiler option if desired.
-# define GTEST_HAS_EXCEPTIONS 1
-# else
+#define GTEST_HAS_EXCEPTIONS 1
+#else
// For other compilers, we assume exceptions are disabled to be
// conservative.
-# define GTEST_HAS_EXCEPTIONS 0
-# endif // defined(_MSC_VER) || defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 0
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
#endif // GTEST_HAS_EXCEPTIONS
#ifndef GTEST_HAS_STD_WSTRING
@@ -477,63 +466,62 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// The user didn't tell us whether RTTI is enabled, so we need to
// figure it out.
-# ifdef _MSC_VER
+#ifdef _MSC_VER
#ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled.
-# define GTEST_HAS_RTTI 1
-# else
-# define GTEST_HAS_RTTI 0
-# endif
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
// enabled.
-# elif defined(__GNUC__)
+#elif defined(__GNUC__)
-# ifdef __GXX_RTTI
+#ifdef __GXX_RTTI
// When building against STLport with the Android NDK and with
// -frtti -fno-exceptions, the build fails at link time with undefined
// references to __cxa_bad_typeid. Not sure if this is an STL or toolchain bug,
// so disable RTTI when detected.
-# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
- !defined(__EXCEPTIONS)
-# define GTEST_HAS_RTTI 0
-# else
-# define GTEST_HAS_RTTI 1
-# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-# else
-# define GTEST_HAS_RTTI 0
-# endif // __GXX_RTTI
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
+#define GTEST_HAS_RTTI 0
+#else
+#define GTEST_HAS_RTTI 1
+#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#else
+#define GTEST_HAS_RTTI 0
+#endif // __GXX_RTTI
// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
// first version with C++ support.
-# elif defined(__clang__)
+#elif defined(__clang__)
-# define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
// both the typeid and dynamic_cast features are present.
-# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
-# ifdef __RTTI_ALL__
-# define GTEST_HAS_RTTI 1
-# else
-# define GTEST_HAS_RTTI 0
-# endif
+#ifdef __RTTI_ALL__
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
-# else
+#else
// For all other compilers, we assume RTTI is enabled.
-# define GTEST_HAS_RTTI 1
+#define GTEST_HAS_RTTI 1
-# endif // _MSC_VER
+#endif // _MSC_VER
#endif // GTEST_HAS_RTTI
// It's this header's responsibility to #include <typeinfo> when RTTI
// is enabled.
#if GTEST_HAS_RTTI
-# include <typeinfo>
+#include <typeinfo>
#endif
// Determines whether Google Test can use the pthreads library.
@@ -547,16 +535,16 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
(GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \
GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \
- GTEST_OS_HAIKU)
+ GTEST_OS_HAIKU || GTEST_OS_GNU_HURD)
#endif // GTEST_HAS_PTHREAD
#if GTEST_HAS_PTHREAD
// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
// true.
-# include <pthread.h> // NOLINT
+#include <pthread.h> // NOLINT
// For timespec and nanosleep, used below.
-# include <time.h> // NOLINT
+#include <time.h> // NOLINT
#endif
// Determines whether clone(2) is supported.
@@ -566,24 +554,23 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#ifndef GTEST_HAS_CLONE
// The user didn't tell us, so we need to figure it out.
-# if GTEST_OS_LINUX && !defined(__ia64__)
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX_ANDROID
// On Android, clone() became available at different API levels for each 32-bit
// architecture.
-# if defined(__LP64__) || \
- (defined(__arm__) && __ANDROID_API__ >= 9) || \
- (defined(__mips__) && __ANDROID_API__ >= 12) || \
- (defined(__i386__) && __ANDROID_API__ >= 17)
-# define GTEST_HAS_CLONE 1
-# else
-# define GTEST_HAS_CLONE 0
-# endif
-# else
-# define GTEST_HAS_CLONE 1
-# endif
-# else
-# define GTEST_HAS_CLONE 0
-# endif // GTEST_OS_LINUX && !defined(__ia64__)
+#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
+ (defined(__mips__) && __ANDROID_API__ >= 12) || \
+ (defined(__i386__) && __ANDROID_API__ >= 17)
+#define GTEST_HAS_CLONE 1
+#else
+#define GTEST_HAS_CLONE 0
+#endif
+#else
+#define GTEST_HAS_CLONE 1
+#endif
+#else
+#define GTEST_HAS_CLONE 0
+#endif // GTEST_OS_LINUX && !defined(__ia64__)
#endif // GTEST_HAS_CLONE
@@ -594,10 +581,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// platforms except known mobile ones.
#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
-# define GTEST_HAS_STREAM_REDIRECTION 0
-# else
-# define GTEST_HAS_STREAM_REDIRECTION 1
-# endif // !GTEST_OS_WINDOWS_MOBILE
+#define GTEST_HAS_STREAM_REDIRECTION 0
+#else
+#define GTEST_HAS_STREAM_REDIRECTION 1
+#endif // !GTEST_OS_WINDOWS_MOBILE
#endif // GTEST_HAS_STREAM_REDIRECTION
// Determines whether to support death tests.
@@ -607,8 +594,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
(GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \
GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
- GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU)
-# define GTEST_HAS_DEATH_TEST 1
+ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \
+ GTEST_OS_GNU_HURD)
+#define GTEST_HAS_DEATH_TEST 1
#endif
// Determines whether to support type-driven tests.
@@ -617,8 +605,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Sun Pro CC, IBM Visual Age, and HP aCC support.
#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
defined(__IBMCPP__) || defined(__HP_aCC)
-# define GTEST_HAS_TYPED_TEST 1
-# define GTEST_HAS_TYPED_TEST_P 1
+#define GTEST_HAS_TYPED_TEST 1
+#define GTEST_HAS_TYPED_TEST_P 1
#endif
// Determines whether the system compiler uses UTF-16 for encoding wide strings.
@@ -627,8 +615,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Determines whether test results can be streamed to a socket.
#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
- GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# define GTEST_CAN_STREAM_RESULTS_ 1
+ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \
+ GTEST_OS_GNU_HURD
+#define GTEST_CAN_STREAM_RESULTS_ 1
#endif
// Defines some utility macros.
@@ -642,9 +631,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
//
// The "switch (0) case 0:" idiom is used to suppress this.
#ifdef __INTEL_COMPILER
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
#else
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ switch (0) \
+ case 0: \
+ default: // NOLINT
#endif
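// Editor's sketch, not part of this patch: the "switch (0) case 0: default:" idiom
// above exists so that assertion macros beginning with a bare `if` (e.g. GTEST_CHECK_
// below) cannot steal an `else` that belongs to the caller. HandleNull() and ok()
// here are hypothetical:
//
//   if (ptr != nullptr)
//     GTEST_CHECK_(ptr->ok());  // the macro starts its own if/else chain
//   else
//     HandleNull();             // without the blocker, this else could bind to the
//                               // macro's internal `if` instead of the outer one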
// Use this annotation at the end of a struct/class definition to
@@ -659,55 +651,32 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Also use it after a variable or parameter declaration to tell the
// compiler the variable/parameter does not have to be used.
#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
#elif defined(__clang__)
-# if __has_attribute(unused)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-# endif
+#if __has_attribute(unused)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#endif
#endif
#ifndef GTEST_ATTRIBUTE_UNUSED_
-# define GTEST_ATTRIBUTE_UNUSED_
+#define GTEST_ATTRIBUTE_UNUSED_
#endif
// Use this annotation before a function that takes a printf format string.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
-# if defined(__MINGW_PRINTF_FORMAT)
+#if defined(__MINGW_PRINTF_FORMAT)
// MinGW has two different printf implementations. Ensure the format macro
// matches the selected implementation. See
// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
- __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \
- first_to_check)))
-# else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
- __attribute__((__format__(__printf__, string_index, first_to_check)))
-# endif
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__(( \
+ __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
#else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
#endif
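// Editor's sketch, not part of this patch: GTEST_ATTRIBUTE_PRINTF_ above lets
// GCC/Clang type-check a printf-style format string against its arguments. LogLine
// is a hypothetical helper:

void LogLine(const char* format, ...) GTEST_ATTRIBUTE_PRINTF_(1, 2);

// LogLine("%s took %d ms", name, millis);  // format checked against arguments
// LogLine("%s took %d ms", millis);        // emits a -Wformat warning when compiled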
-
-
-// A macro to disallow copy operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type) \
- type& operator=(type const &) = delete
-
-// A macro to disallow copy constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
- type(type const&) = delete; \
- type& operator=(type const&) = delete
-
-// A macro to disallow move operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \
- type& operator=(type &&) noexcept = delete
-
-// A macro to disallow move constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \
- type(type&&) noexcept = delete; \
- type& operator=(type&&) noexcept = delete
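// Editor's sketch, not part of this patch: the GTEST_DISALLOW_* helpers removed
// above are now spelled out at each use site. For a hypothetical class MyType, the
// equivalent of the old GTEST_DISALLOW_COPY_AND_ASSIGN_ and
// GTEST_DISALLOW_MOVE_AND_ASSIGN_ pair is simply:

class MyType {
 public:
  MyType() = default;
  MyType(const MyType&) = delete;              // was GTEST_DISALLOW_COPY_AND_ASSIGN_
  MyType& operator=(const MyType&) = delete;
  MyType(MyType&&) noexcept = delete;          // was GTEST_DISALLOW_MOVE_AND_ASSIGN_
  MyType& operator=(MyType&&) noexcept = delete;
};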
// Tell the compiler to warn about unused return values for functions declared
// with this macro. The macro should be used on function declarations
@@ -715,9 +684,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
//
// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
#else
-# define GTEST_MUST_USE_RESULT_
+#define GTEST_MUST_USE_RESULT_
#endif // __GNUC__ && !COMPILER_ICC
// MS C++ compiler emits warning when a conditional expression is compile time
@@ -728,10 +697,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// while (true) {
// GTEST_INTENTIONAL_CONST_COND_POP_()
// }
-# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
-# define GTEST_INTENTIONAL_CONST_COND_POP_() \
- GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
// Determine whether the compiler supports Microsoft's Structured Exception
// Handling. This is supported by several Windows compilers but generally
@@ -739,13 +707,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#ifndef GTEST_HAS_SEH
// The user didn't tell us, so we need to figure it out.
-# if defined(_MSC_VER) || defined(__BORLANDC__)
+#if defined(_MSC_VER) || defined(__BORLANDC__)
// These two compilers are known to support SEH.
-# define GTEST_HAS_SEH 1
-# else
+#define GTEST_HAS_SEH 1
+#else
// Assume no SEH.
-# define GTEST_HAS_SEH 0
-# endif
+#define GTEST_HAS_SEH 0
+#endif
#endif // GTEST_HAS_SEH
@@ -758,94 +726,112 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#endif // GTEST_IS_THREADSAFE
+#if GTEST_IS_THREADSAFE
+// Some platforms don't support including these threading related headers.
+#include <condition_variable> // NOLINT
+#include <mutex> // NOLINT
+#endif // GTEST_IS_THREADSAFE
+
// GTEST_API_ qualifies all symbols that must be exported. The definitions below
// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
// gtest/internal/custom/gtest-port.h
#ifndef GTEST_API_
#ifdef _MSC_VER
-# if GTEST_LINKED_AS_SHARED_LIBRARY
-# define GTEST_API_ __declspec(dllimport)
-# elif GTEST_CREATE_SHARED_LIBRARY
-# define GTEST_API_ __declspec(dllexport)
-# endif
+#if GTEST_LINKED_AS_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllimport)
+#elif GTEST_CREATE_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllexport)
+#endif
#elif __GNUC__ >= 4 || defined(__clang__)
-# define GTEST_API_ __attribute__((visibility ("default")))
+#define GTEST_API_ __attribute__((visibility("default")))
#endif // _MSC_VER
#endif // GTEST_API_
#ifndef GTEST_API_
-# define GTEST_API_
+#define GTEST_API_
#endif // GTEST_API_
#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
-# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
+#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
#endif // GTEST_DEFAULT_DEATH_TEST_STYLE
#ifdef __GNUC__
// Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline))
+#define GTEST_NO_INLINE_ __attribute__((noinline))
#else
-# define GTEST_NO_INLINE_
+#define GTEST_NO_INLINE_
+#endif
+
+#if defined(__clang__)
+// Nested ifs to avoid triggering MSVC warning.
+#if __has_attribute(disable_tail_calls)
+// Ask the compiler not to perform tail call optimization inside
+// the marked function.
+#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls))
+#endif
+#elif __GNUC__
+#define GTEST_NO_TAIL_CALL_ \
+ __attribute__((optimize("no-optimize-sibling-calls")))
+#else
+#define GTEST_NO_TAIL_CALL_
#endif
// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
#if !defined(GTEST_HAS_CXXABI_H_)
-# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-# define GTEST_HAS_CXXABI_H_ 1
-# else
-# define GTEST_HAS_CXXABI_H_ 0
-# endif
+#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define GTEST_HAS_CXXABI_H_ 1
+#else
+#define GTEST_HAS_CXXABI_H_ 0
+#endif
#endif
// A function level attribute to disable checking for use of uninitialized
// memory when built with MemorySanitizer.
#if defined(__clang__)
-# if __has_feature(memory_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
- __attribute__((no_sanitize_memory))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-# endif // __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif // __has_feature(memory_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
#endif // __clang__
// A function level attribute to disable AddressSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(address_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
- __attribute__((no_sanitize_address))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-# endif // __has_feature(address_sanitizer)
+#if __has_feature(address_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+ __attribute__((no_sanitize_address))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif // __has_feature(address_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
#endif // __clang__
// A function level attribute to disable HWAddressSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(hwaddress_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
- __attribute__((no_sanitize("hwaddress")))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-# endif // __has_feature(hwaddress_sanitizer)
+#if __has_feature(hwaddress_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+ __attribute__((no_sanitize("hwaddress")))
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif // __has_feature(hwaddress_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
#endif // __clang__
// A function level attribute to disable ThreadSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(thread_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
- __attribute__((no_sanitize_thread))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-# endif // __has_feature(thread_sanitizer)
+#if __has_feature(thread_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif // __has_feature(thread_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
#endif // __clang__
namespace testing {
@@ -867,25 +853,37 @@ namespace internal {
// Secret object, which is what we want.
class Secret;
-// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile
-// time expression is true (in new code, use static_assert instead). For
-// example, you could use it to verify the size of a static array:
-//
-// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
-// names_incorrect_size);
-//
-// The second argument to the macro must be a valid C++ identifier. If the
-// expression is false, compiler will issue an error containing this identifier.
-#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
-
// A helper for suppressing warnings on constant condition. It just
// returns 'condition'.
GTEST_API_ bool IsTrue(bool condition);
// Defines RE.
-#if GTEST_USES_PCRE
-// if used, PCRE is injected by custom/gtest-port.h
+#if GTEST_USES_RE2
+
+// This is almost `using RE = ::RE2`, except it is copy-constructible, and it
+// needs to disambiguate the `std::string`, `absl::string_view`, and `const
+// char*` constructors.
+class GTEST_API_ RE {
+ public:
+ RE(absl::string_view regex) : regex_(regex) {} // NOLINT
+ RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT
+ RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT
+ RE(const RE& other) : RE(other.pattern()) {}
+
+ const std::string& pattern() const { return regex_.pattern(); }
+
+ static bool FullMatch(absl::string_view str, const RE& re) {
+ return RE2::FullMatch(str, re.regex_);
+ }
+ static bool PartialMatch(absl::string_view str, const RE& re) {
+ return RE2::PartialMatch(str, re.regex_);
+ }
+
+ private:
+ RE2 regex_;
+};
+
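// Editor's sketch, not part of this patch; it assumes a build with GTEST_HAS_ABSL
// (Abseil + RE2) so the RE2-backed wrapper above is selected. RE is an internal
// type that user code normally reaches only indirectly, e.g. through death-test
// regexes such as EXPECT_DEATH(stmt, "ab.*d"):
//
//   ::testing::internal::RE re("ab.*d");
//   bool full = ::testing::internal::RE::FullMatch("abcd", re);         // true
//   bool part = ::testing::internal::RE::PartialMatch("xxabcdxx", re);  // true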
#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
// A simple C++ wrapper for <regex.h>. It uses the POSIX Extended
@@ -924,19 +922,19 @@ class GTEST_API_ RE {
const char* pattern_;
bool is_valid_;
-# if GTEST_USES_POSIX_RE
+#if GTEST_USES_POSIX_RE
regex_t full_regex_; // For FullMatch().
regex_t partial_regex_; // For PartialMatch().
-# else // GTEST_USES_SIMPLE_RE
+#else // GTEST_USES_SIMPLE_RE
const char* full_pattern_; // For FullMatch();
-# endif
+#endif
};
-#endif // GTEST_USES_PCRE
+#endif // ::testing::internal::RE implementation
// Formats a source file path and a line number as they would appear
// in an error message from the compiler used to compile this code.
@@ -954,12 +952,7 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
// LogToStderr() - directs all log messages to stderr.
// FlushInfoLog() - flushes informational log messages.
-enum GTestLogSeverity {
- GTEST_INFO,
- GTEST_WARNING,
- GTEST_ERROR,
- GTEST_FATAL
-};
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
// Formats log entry severity, provides a stream object for streaming the
// log message, and terminates the message with a newline when going out of
@@ -976,14 +969,16 @@ class GTEST_API_ GTestLog {
private:
const GTestLogSeverity severity_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+ GTestLog(const GTestLog&) = delete;
+ GTestLog& operator=(const GTestLog&) = delete;
};
#if !defined(GTEST_LOG_)
-# define GTEST_LOG_(severity) \
- ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
- __FILE__, __LINE__).GetStream()
+#define GTEST_LOG_(severity) \
+ ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+ __FILE__, __LINE__) \
+ .GetStream()
inline void LogToStderr() {}
inline void FlushInfoLog() { fflush(nullptr); }
@@ -995,7 +990,7 @@ inline void FlushInfoLog() { fflush(nullptr); }
//
// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
// is not satisfied.
-// Synopsys:
+// Synopsis:
// GTEST_CHECK_(boolean_condition);
// or
// GTEST_CHECK_(boolean_condition) << "Additional message";
@@ -1005,12 +1000,12 @@ inline void FlushInfoLog() { fflush(nullptr); }
// condition itself, plus additional message streamed into it, if any,
// and then it aborts the program. It aborts the program irrespective of
// whether it is built in the debug mode or not.
-# define GTEST_CHECK_(condition) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::IsTrue(condition)) \
- ; \
- else \
- GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#define GTEST_CHECK_(condition) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::IsTrue(condition)) \
+ ; \
+ else \
+ GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
#endif // !defined(GTEST_CHECK_)
// An all-mode assert to verify that the given POSIX-style function
@@ -1019,9 +1014,8 @@ inline void FlushInfoLog() { fflush(nullptr); }
// in {} if you need to use it as the only statement in an 'if'
// branch.
#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
- if (const int gtest_error = (posix_call)) \
- GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
- << gtest_error
+ if (const int gtest_error = (posix_call)) \
+ GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
// Transforms "T" into "const T&" according to standard reference collapsing
// rules (this is only needed as a backport for C++98 compilers that do not
@@ -1035,9 +1029,13 @@ inline void FlushInfoLog() { fflush(nullptr); }
// Note that the non-const reference will not have "const" added. This is
// standard, and necessary so that "T" can always bind to "const T&".
template <typename T>
-struct ConstRef { typedef const T& type; };
+struct ConstRef {
+ typedef const T& type;
+};
template <typename T>
-struct ConstRef<T&> { typedef T& type; };
+struct ConstRef<T&> {
+ typedef T& type;
+};
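The two ConstRef specializations above are a reference-collapsing helper: a value type T is passed on as const T&, while a T& argument keeps its non-const reference type. A quick standalone check of the same behaviour (ConstRef re-declared locally just for the static_asserts):

    #include <type_traits>

    template <typename T>
    struct ConstRef {
      typedef const T& type;
    };
    template <typename T>
    struct ConstRef<T&> {
      typedef T& type;
    };

    static_assert(std::is_same<ConstRef<int>::type, const int&>::value,
                  "a value type becomes a reference-to-const");
    static_assert(std::is_same<ConstRef<int&>::type, int&>::value,
                  "a non-const reference keeps its mutability");

    int main() { return 0; }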
// The argument T must depend on some template parameters.
#define GTEST_REFERENCE_TO_CONST_(T) \
@@ -1050,7 +1048,7 @@ struct ConstRef<T&> { typedef T& type; };
// const Foo*). When you use ImplicitCast_, the compiler checks that
// the cast is safe. Such explicit ImplicitCast_s are necessary in
// surprisingly many situations where C++ demands an exact type match
-// instead of an argument type convertable to a target type.
+// instead of an argument type convertible to a target type.
//
// The syntax for using ImplicitCast_ is the same as for static_cast:
//
@@ -1063,8 +1061,10 @@ struct ConstRef<T&> { typedef T& type; };
// This relatively ugly name is intentional. It prevents clashes with
// similar functions users may have (e.g., implicit_cast). The internal
// namespace alone is not enough because the function can be found by ADL.
-template<typename To>
-inline To ImplicitCast_(To x) { return x; }
+template <typename To>
+inline To ImplicitCast_(To x) {
+ return x;
+}
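ImplicitCast_ is only reformatted here, but its point is worth spelling out: because the target type is the parameter type, the conversion must already be one the compiler would perform implicitly, so unsafe directions simply fail to compile. A small sketch using the same one-liner:

    template <typename To>
    inline To ImplicitCast_(To x) {
      return x;
    }

    struct Base {
      virtual ~Base() {}
    };
    struct Derived : Base {};

    int main() {
      Derived d;
      Base* b = ImplicitCast_<Base*>(&d);  // upcast: implicit, so this compiles
      // The opposite direction would be rejected, because Base* does not
      // convert implicitly to Derived*:
      //   Derived* d2 = ImplicitCast_<Derived*>(b);
      (void)b;
      return 0;
    }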
// When you upcast (that is, cast a pointer from type Foo to type
// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
@@ -1087,17 +1087,17 @@ inline To ImplicitCast_(To x) { return x; }
// This relatively ugly name is intentional. It prevents clashes with
// similar functions users may have (e.g., down_cast). The internal
// namespace alone is not enough because the function can be found by ADL.
-template<typename To, typename From> // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From* f) { // so we only accept pointers
+template <typename To, typename From> // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) { // so we only accept pointers
// Ensures that To is a sub-type of From *. This test is here only
// for compile-time type checking, and has no overhead in an
// optimized build at run-time, as it will be optimized away
// completely.
GTEST_INTENTIONAL_CONST_COND_PUSH_()
if (false) {
- GTEST_INTENTIONAL_CONST_COND_POP_()
- const To to = nullptr;
- ::testing::internal::ImplicitCast_<From*>(to);
+ GTEST_INTENTIONAL_CONST_COND_POP_()
+ const To to = nullptr;
+ ::testing::internal::ImplicitCast_<From*>(to);
}
#if GTEST_HAS_RTTI
@@ -1162,71 +1162,8 @@ void ClearInjectableArgvs();
// Defines synchronization primitives.
#if GTEST_IS_THREADSAFE
-# if GTEST_HAS_PTHREAD
-// Sleeps for (roughly) n milliseconds. This function is only for testing
-// Google Test's own constructs. Don't use it in user tests, either
-// directly or indirectly.
-inline void SleepMilliseconds(int n) {
- const timespec time = {
- 0, // 0 seconds.
- n * 1000L * 1000L, // And n ms.
- };
- nanosleep(&time, nullptr);
-}
-# endif // GTEST_HAS_PTHREAD
-
-# if GTEST_HAS_NOTIFICATION_
-// Notification has already been imported into the namespace.
-// Nothing to do here.
-
-# elif GTEST_HAS_PTHREAD
-// Allows a controller thread to pause execution of newly created
-// threads until notified. Instances of this class must be created
-// and destroyed in the controller thread.
-//
-// This class is only for testing Google Test's own constructs. Do not
-// use it in user tests, either directly or indirectly.
-class Notification {
- public:
- Notification() : notified_(false) {
- GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
- }
- ~Notification() {
- pthread_mutex_destroy(&mutex_);
- }
-
- // Notifies all threads created with this notification to start. Must
- // be called from the controller thread.
- void Notify() {
- pthread_mutex_lock(&mutex_);
- notified_ = true;
- pthread_mutex_unlock(&mutex_);
- }
-
- // Blocks until the controller thread notifies. Must be called from a test
- // thread.
- void WaitForNotification() {
- for (;;) {
- pthread_mutex_lock(&mutex_);
- const bool notified = notified_;
- pthread_mutex_unlock(&mutex_);
- if (notified)
- break;
- SleepMilliseconds(10);
- }
- }
-
- private:
- pthread_mutex_t mutex_;
- bool notified_;
-
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
-};
-
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-
-GTEST_API_ void SleepMilliseconds(int n);
+#if GTEST_OS_WINDOWS
// Provides leak-safe Windows kernel handle ownership.
// Used in death tests and in threading support.
class GTEST_API_ AutoHandle {
@@ -1253,8 +1190,18 @@ class GTEST_API_ AutoHandle {
Handle handle_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+ AutoHandle(const AutoHandle&) = delete;
+ AutoHandle& operator=(const AutoHandle&) = delete;
};
+#endif
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#else
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
// Allows a controller thread to pause execution of newly created
// threads until notified. Instances of this class must be created
@@ -1262,23 +1209,40 @@ class GTEST_API_ AutoHandle {
//
// This class is only for testing Google Test's own constructs. Do not
// use it in user tests, either directly or indirectly.
+// TODO(b/203539622): Replace unconditionally with absl::Notification.
class GTEST_API_ Notification {
public:
- Notification();
- void Notify();
- void WaitForNotification();
+ Notification() : notified_(false) {}
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
- private:
- AutoHandle event_;
+ // Notifies all threads created with this notification to start. Must
+ // be called from the controller thread.
+ void Notify() {
+ std::lock_guard<std::mutex> lock(mu_);
+ notified_ = true;
+ cv_.notify_all();
+ }
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+ // Blocks until the controller thread notifies. Must be called from a test
+ // thread.
+ void WaitForNotification() {
+ std::unique_lock<std::mutex> lock(mu_);
+ cv_.wait(lock, [this]() { return notified_; });
+ }
+
+ private:
+ std::mutex mu_;
+ std::condition_variable cv_;
+ bool notified_;
};
-# endif // GTEST_HAS_NOTIFICATION_
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+#endif // GTEST_HAS_NOTIFICATION_
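The rewritten Notification above drops the pthread and AutoHandle variants in favour of one std::mutex + std::condition_variable implementation. The wait/notify idiom it uses is plain standard C++; here it is in a self-contained program (the standard-library idiom, not gtest code):

    #include <condition_variable>
    #include <iostream>
    #include <mutex>
    #include <thread>

    class Notification {
     public:
      void Notify() {
        std::lock_guard<std::mutex> lock(mu_);
        notified_ = true;
        cv_.notify_all();  // wake all waiters; the flag makes spurious wakeups harmless
      }
      void WaitForNotification() {
        std::unique_lock<std::mutex> lock(mu_);
        cv_.wait(lock, [this] { return notified_; });  // re-checks the predicate on wakeup
      }

     private:
      std::mutex mu_;
      std::condition_variable cv_;
      bool notified_ = false;
    };

    int main() {
      Notification start;
      std::thread worker([&] {
        start.WaitForNotification();  // blocks until the main thread notifies
        std::cout << "worker released\n";
      });
      start.Notify();
      worker.join();
      return 0;
    }

Guarding the bool with the same mutex the condition variable waits on makes the hand-off race-free, and the predicate form of cv_.wait also covers the case where Notify() runs before the waiter ever blocks.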
// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
// defined, but we don't want to use MinGW's pthreads implementation, which
// has conformance problems with some versions of the POSIX standard.
-# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
// Consequently, it cannot select a correct instantiation of ThreadWithParam
@@ -1354,16 +1318,17 @@ class ThreadWithParam : public ThreadWithParamBase {
// finished.
pthread_t thread_; // The native thread object.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
};
-# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
- // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+ // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
-# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
// Mutex and ThreadLocal have already been imported into the namespace.
// Nothing to do here.
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// Mutex implements mutex on Windows platforms. It is used in conjunction
// with class MutexLock:
@@ -1417,14 +1382,15 @@ class GTEST_API_ Mutex {
long critical_section_init_phase_; // NOLINT
GTEST_CRITICAL_SECTION* critical_section_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
};
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
- extern ::testing::internal::Mutex mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::Mutex mutex
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
- ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+ ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
// We cannot name this class MutexLock because the ctor declaration would
// conflict with a macro named MutexLock, which is defined on some
@@ -1433,15 +1399,15 @@ class GTEST_API_ Mutex {
// "MutexLock l(&mu)". Hence the typedef trick below.
class GTestMutexLock {
public:
- explicit GTestMutexLock(Mutex* mutex)
- : mutex_(mutex) { mutex_->Lock(); }
+ explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); }
~GTestMutexLock() { mutex_->Unlock(); }
private:
Mutex* const mutex_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
};
typedef GTestMutexLock MutexLock;
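GTestMutexLock above is the classic scoped-lock shape: lock in the constructor, unlock in the destructor, copying deleted. In ordinary user code std::lock_guard plays the same role; a tiny sketch of the pattern for comparison:

    #include <mutex>

    std::mutex g_mu;
    int g_counter = 0;

    void Increment() {
      std::lock_guard<std::mutex> lock(g_mu);  // released on every return path
      ++g_counter;
    }

    int main() {
      Increment();
      return 0;
    }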
@@ -1468,7 +1434,8 @@ class ThreadLocalBase {
virtual ~ThreadLocalBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
+ ThreadLocalBase(const ThreadLocalBase&) = delete;
+ ThreadLocalBase& operator=(const ThreadLocalBase&) = delete;
};
// Maps a thread to a set of ThreadLocals that have values instantiated on that
@@ -1497,7 +1464,7 @@ class GTEST_API_ ThreadWithParamBase {
virtual void Run() = 0;
};
- ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+ ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start);
virtual ~ThreadWithParamBase();
private:
@@ -1511,30 +1478,26 @@ class ThreadWithParam : public ThreadWithParamBase {
typedef void UserThreadFunc(T);
ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
- : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
- }
+ : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
virtual ~ThreadWithParam() {}
private:
class RunnableImpl : public Runnable {
public:
- RunnableImpl(UserThreadFunc* func, T param)
- : func_(func),
- param_(param) {
- }
+ RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {}
virtual ~RunnableImpl() {}
- virtual void Run() {
- func_(param_);
- }
+ virtual void Run() { func_(param_); }
private:
UserThreadFunc* const func_;
const T param_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
+ RunnableImpl(const RunnableImpl&) = delete;
+ RunnableImpl& operator=(const RunnableImpl&) = delete;
};
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
};
// Implements thread-local storage on Windows systems.
@@ -1571,7 +1534,7 @@ class ThreadLocal : public ThreadLocalBase {
explicit ThreadLocal(const T& value)
: default_factory_(new InstanceValueHolderFactory(value)) {}
- ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+ ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
T* pointer() { return GetOrCreateValue(); }
const T* pointer() const { return GetOrCreateValue(); }
@@ -1590,16 +1553,17 @@ class ThreadLocal : public ThreadLocalBase {
private:
T value_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
};
-
T* GetOrCreateValue() const {
return static_cast<ValueHolder*>(
- ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+ ThreadLocalRegistry::GetValueOnCurrentThread(this))
+ ->pointer();
}
- virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+ ThreadLocalValueHolderBase* NewValueForCurrentThread() const override {
return default_factory_->MakeNewHolder();
}
@@ -1610,7 +1574,8 @@ class ThreadLocal : public ThreadLocalBase {
virtual ValueHolder* MakeNewHolder() const = 0;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
};
class DefaultValueHolderFactory : public ValueHolderFactory {
@@ -1619,7 +1584,9 @@ class ThreadLocal : public ThreadLocalBase {
ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
};
class InstanceValueHolderFactory : public ValueHolderFactory {
@@ -1632,15 +1599,18 @@ class ThreadLocal : public ThreadLocalBase {
private:
const T value_; // The value for each thread.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
};
std::unique_ptr<ValueHolderFactory> default_factory_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
};
-# elif GTEST_HAS_PTHREAD
+#elif GTEST_HAS_PTHREAD
// MutexBase and Mutex implement mutex on pthreads-based platforms.
class MutexBase {
@@ -1687,8 +1657,8 @@ class MutexBase {
};
// Forward-declares a static mutex.
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
- extern ::testing::internal::MutexBase mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::MutexBase mutex
// Defines and statically (i.e. at link time) initializes a static mutex.
// The initialization list here does not explicitly initialize each field,
@@ -1707,12 +1677,11 @@ class Mutex : public MutexBase {
GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
has_owner_ = false;
}
- ~Mutex() {
- GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
- }
+ ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
};
// We cannot name this class MutexLock because the ctor declaration would
@@ -1722,15 +1691,15 @@ class Mutex : public MutexBase {
// "MutexLock l(&mu)". Hence the typedef trick below.
class GTestMutexLock {
public:
- explicit GTestMutexLock(MutexBase* mutex)
- : mutex_(mutex) { mutex_->Lock(); }
+ explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); }
~GTestMutexLock() { mutex_->Unlock(); }
private:
MutexBase* const mutex_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
};
typedef GTestMutexLock MutexLock;
@@ -1787,7 +1756,8 @@ class GTEST_API_ ThreadLocal {
private:
T value_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
};
static pthread_key_t CreateKey() {
@@ -1819,7 +1789,8 @@ class GTEST_API_ ThreadLocal {
virtual ValueHolder* MakeNewHolder() const = 0;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
};
class DefaultValueHolderFactory : public ValueHolderFactory {
@@ -1828,7 +1799,9 @@ class GTEST_API_ ThreadLocal {
ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
};
class InstanceValueHolderFactory : public ValueHolderFactory {
@@ -1841,17 +1814,20 @@ class GTEST_API_ ThreadLocal {
private:
const T value_; // The value for each thread.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
};
// A key pthreads uses for looking up per-thread values.
const pthread_key_t key_;
std::unique_ptr<ValueHolderFactory> default_factory_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
};
-# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
#else // GTEST_IS_THREADSAFE
@@ -1868,10 +1844,10 @@ class Mutex {
void AssertHeld() const {}
};
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
extern ::testing::internal::Mutex mutex
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
// We cannot name this class MutexLock because the ctor declaration would
// conflict with a macro named MutexLock, which is defined on some
@@ -1894,6 +1870,7 @@ class GTEST_API_ ThreadLocal {
const T* pointer() const { return &value_; }
const T& get() const { return value_; }
void set(const T& value) { value_ = value; }
+
private:
T value_;
};
@@ -1905,11 +1882,11 @@ class GTEST_API_ ThreadLocal {
GTEST_API_ size_t GetThreadCount();
#if GTEST_OS_WINDOWS
-# define GTEST_PATH_SEP_ "\\"
-# define GTEST_HAS_ALT_PATH_SEP_ 1
+#define GTEST_PATH_SEP_ "\\"
+#define GTEST_HAS_ALT_PATH_SEP_ 1
#else
-# define GTEST_PATH_SEP_ "/"
-# define GTEST_HAS_ALT_PATH_SEP_ 0
+#define GTEST_PATH_SEP_ "/"
+#define GTEST_HAS_ALT_PATH_SEP_ 0
#endif // GTEST_OS_WINDOWS
// Utilities for char.
@@ -1967,8 +1944,7 @@ inline char ToUpper(char ch) {
inline std::string StripTrailingSpaces(std::string str) {
std::string::iterator it = str.end();
- while (it != str.begin() && IsSpace(*--it))
- it = str.erase(it);
+ while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
return str;
}
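StripTrailingSpaces now fits on one line but does the same thing: erase whitespace from the end of the string and leave the front alone. A quick standalone check (std::isspace stands in for gtest's IsSpace here):

    #include <cassert>
    #include <cctype>
    #include <string>

    std::string StripTrailingSpaces(std::string str) {
      std::string::iterator it = str.end();
      while (it != str.begin() &&
             std::isspace(static_cast<unsigned char>(*--it)))
        it = str.erase(it);
      return str;
    }

    int main() {
      assert(StripTrailingSpaces("value \t ") == "value");
      assert(StripTrailingSpaces("  leading stays") == "  leading stays");
      return 0;
    }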
@@ -1986,36 +1962,35 @@ namespace posix {
typedef struct _stat StatStruct;
-# ifdef __BORLANDC__
+#ifdef __BORLANDC__
inline int DoIsATTY(int fd) { return isatty(fd); }
inline int StrCaseCmp(const char* s1, const char* s2) {
return stricmp(s1, s2);
}
inline char* StrDup(const char* src) { return strdup(src); }
-# else // !__BORLANDC__
-# if GTEST_OS_WINDOWS_MOBILE
+#else // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+ GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
inline int DoIsATTY(int /* fd */) { return 0; }
-# else
+#else
inline int DoIsATTY(int fd) { return _isatty(fd); }
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
inline int StrCaseCmp(const char* s1, const char* s2) {
return _stricmp(s1, s2);
}
inline char* StrDup(const char* src) { return _strdup(src); }
-# endif // __BORLANDC__
+#endif // __BORLANDC__
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
// time and thus not defined there.
-# else
+#else
inline int FileNo(FILE* file) { return _fileno(file); }
inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
inline int RmDir(const char* dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct& st) {
- return (_S_IFDIR & st.st_mode) != 0;
-}
-# endif // GTEST_OS_WINDOWS_MOBILE
+inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; }
+#endif // GTEST_OS_WINDOWS_MOBILE
#elif GTEST_OS_ESP8266
typedef struct stat StatStruct;
@@ -2079,12 +2054,12 @@ inline FILE* FOpen(const char* path, const char* mode) {
std::wstring wide_path = converter.from_bytes(path);
std::wstring wide_mode = converter.from_bytes(mode);
return _wfopen(wide_path.c_str(), wide_mode.c_str());
-#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
return fopen(path, mode);
#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
}
#if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+inline FILE* FReopen(const char* path, const char* mode, FILE* stream) {
return freopen(path, mode, stream);
}
inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
@@ -2136,13 +2111,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_()
// snprintf is a variadic function.
#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
// MSVC 2005 and above support variadic macros.
-# define GTEST_SNPRINTF_(buffer, size, format, ...) \
- _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#define GTEST_SNPRINTF_(buffer, size, format, ...) \
+ _snprintf_s(buffer, size, size, format, __VA_ARGS__)
#elif defined(_MSC_VER)
// Windows CE does not define _snprintf_s
-# define GTEST_SNPRINTF_ _snprintf
+#define GTEST_SNPRINTF_ _snprintf
#else
-# define GTEST_SNPRINTF_ snprintf
+#define GTEST_SNPRINTF_ snprintf
#endif
// The biggest signed integer type the compiler supports.
@@ -2202,37 +2177,84 @@ using TimeInMillis = int64_t; // Represents time in milliseconds.
// Macro for referencing flags.
#if !defined(GTEST_FLAG)
-# define GTEST_FLAG(name) FLAGS_gtest_##name
+#define GTEST_FLAG_NAME_(name) gtest_##name
+#define GTEST_FLAG(name) FLAGS_gtest_##name
#endif // !defined(GTEST_FLAG)
-#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
-#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
-#if !defined(GTEST_DECLARE_bool_)
-# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc)
// Macros for declaring flags.
-# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-# define GTEST_DECLARE_int32_(name) \
- GTEST_API_ extern std::int32_t GTEST_FLAG(name)
-# define GTEST_DECLARE_string_(name) \
- GTEST_API_ extern ::std::string GTEST_FLAG(name)
+#define GTEST_DECLARE_bool_(name) \
+ ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_int32_(name) \
+ ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_string_(name) \
+ ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name))
+
+#define GTEST_FLAG_SAVER_ ::absl::FlagSaver
+
+#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name))
+#define GTEST_FLAG_SET(name, value) \
+ (void)(::absl::SetFlag(&GTEST_FLAG(name), value))
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0
+
+#else // GTEST_HAS_ABSL
// Macros for defining flags.
-# define GTEST_DEFINE_bool_(name, default_val, doc) \
- GTEST_API_ bool GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_int32_(name, default_val, doc) \
- GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_string_(name, default_val, doc) \
- GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ bool GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
-#endif // !defined(GTEST_DECLARE_bool_)
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) \
+ namespace testing { \
+ GTEST_API_ extern bool GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_int32_(name) \
+ namespace testing { \
+ GTEST_API_ extern std::int32_t GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_string_(name) \
+ namespace testing { \
+ GTEST_API_ extern ::std::string GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
+#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+
+#endif // GTEST_HAS_ABSL
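The net effect of this block is that all flag access goes through GTEST_FLAG_GET and GTEST_FLAG_SET, which resolve either to absl::GetFlag/absl::SetFlag or to the plain FLAGS_gtest_* globals. A sketch of how a test might use the accessors defined in this hunk (the choice of death_test_style is only an example):

    #include <cstdlib>
    #include <string>

    #include "gtest/gtest.h"

    TEST(FlagAccessDemo, SwitchesDeathTestStyle) {
      const std::string old_style = GTEST_FLAG_GET(death_test_style);
      GTEST_FLAG_SET(death_test_style, "threadsafe");  // slower but more robust style
      EXPECT_DEATH(std::abort(), "");
      GTEST_FLAG_SET(death_test_style, old_style);     // restore for later tests
    }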
// Thread annotations
#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-# define GTEST_LOCK_EXCLUDED_(locks)
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
// Parses 'str' for a 32-bit signed integer. If successful, writes the result
@@ -2308,6 +2330,7 @@ namespace testing {
namespace internal {
template <typename T>
using Optional = ::absl::optional<T>;
+inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; }
} // namespace internal
} // namespace testing
#else
@@ -2321,6 +2344,7 @@ namespace testing {
namespace internal {
template <typename T>
using Optional = ::std::optional<T>;
+inline ::std::nullopt_t Nullopt() { return ::std::nullopt; }
} // namespace internal
} // namespace testing
// The case where absl is configured NOT to alias std::optional is not
@@ -2332,7 +2356,7 @@ using Optional = ::std::optional<T>;
#if GTEST_HAS_ABSL
// Always use absl::string_view for Matcher<> specializations if googletest
// is built with absl support.
-# define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
#include "absl/strings/string_view.h"
namespace testing {
namespace internal {
@@ -2340,11 +2364,11 @@ using StringView = ::absl::string_view;
} // namespace internal
} // namespace testing
#else
-# ifdef __has_include
-# if __has_include(<string_view>) && __cplusplus >= 201703L
+#ifdef __has_include
+#if __has_include(<string_view>) && __cplusplus >= 201703L
// Otherwise for C++17 and higher use std::string_view for Matcher<>
// specializations.
-# define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
#include <string_view>
namespace testing {
namespace internal {
@@ -2353,8 +2377,8 @@ using StringView = ::std::string_view;
} // namespace testing
// The case where absl is configured NOT to alias std::string_view is not
// supported.
-# endif // __has_include(<string_view>) && __cplusplus >= 201703L
-# endif // __has_include
+#endif // __has_include(<string_view>) && __cplusplus >= 201703L
+#endif // __has_include
#endif // GTEST_HAS_ABSL
#if GTEST_HAS_ABSL
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
index 10f774f96..cca2e1f2a 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file declares the String class and functions used internally by
@@ -36,17 +36,20 @@
// This header file is #included by gtest-internal.h.
// It should not be #included by other files.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
#ifdef __BORLANDC__
// string.h is not guaranteed to provide strcpy on C++ Builder.
-# include <mem.h>
+#include <mem.h>
#endif
#include <string.h>
+
#include <cstdint>
#include <string>
@@ -123,8 +126,7 @@ class GTEST_API_ String {
// Unlike strcasecmp(), this function can handle NULL argument(s).
// A NULL C string is considered different to any non-NULL C string,
// including the empty string.
- static bool CaseInsensitiveCStringEquals(const char* lhs,
- const char* rhs);
+ static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs);
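The comment above pins down the NULL contract: a null C string is unequal to every non-null string, including the empty one, while two nulls compare equal. A standalone helper following that contract (written fresh here with POSIX strcasecmp, not gtest's implementation):

    #include <cassert>
    #include <strings.h>  // strcasecmp (POSIX)

    bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
      if (lhs == nullptr) return rhs == nullptr;  // null only equals null
      if (rhs == nullptr) return false;
      return strcasecmp(lhs, rhs) == 0;           // case-insensitive compare
    }

    int main() {
      assert(CaseInsensitiveCStringEquals("Value", "vALUe"));
      assert(!CaseInsensitiveCStringEquals(nullptr, ""));
      assert(CaseInsensitiveCStringEquals(nullptr, nullptr));
      return 0;
    }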
// Compares two wide C strings, ignoring case. Returns true if and only if
// they have the same content.
@@ -143,8 +145,8 @@ class GTEST_API_ String {
// Returns true if and only if the given string ends with the given suffix,
// ignoring case. Any string is considered to end with an empty suffix.
- static bool EndsWithCaseInsensitive(
- const std::string& str, const std::string& suffix);
+ static bool EndsWithCaseInsensitive(const std::string& str,
+ const std::string& suffix);
// Formats an int value as "%02d".
static std::string FormatIntWidth2(int value); // "%02d" for width == 2
@@ -163,7 +165,7 @@ class GTEST_API_ String {
private:
String(); // Not meant to be instantiated.
-}; // class String
+}; // class String
// Gets the content of the stringstream's buffer as an std::string. Each '\0'
// character in the buffer is replaced with "\\0".
diff --git a/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
index b87a2e2ca..6bc02a7de 100644
--- a/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
+++ b/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
@@ -30,7 +30,9 @@
// Type utilities needed for implementing typed and type-parameterized
// tests.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
@@ -39,11 +41,11 @@
// #ifdef __GNUC__ is too general here. It is possible to use gcc without using
// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-# include <cxxabi.h>
-# elif defined(__HP_aCC)
-# include <acxx_demangle.h>
-# endif // GTEST_HASH_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
+#include <cxxabi.h>
+#elif defined(__HP_aCC)
+#include <acxx_demangle.h>
+#endif // GTEST_HASH_CXXABI_H_
namespace testing {
namespace internal {
@@ -101,7 +103,9 @@ std::string GetTypeName() {
// A unique type indicating an empty node
struct None {};
-# define GTEST_TEMPLATE_ template <typename T> class
+#define GTEST_TEMPLATE_ \
+ template <typename T> \
+ class
// The template "selector" struct TemplateSel<Tmpl> is used to
// represent Tmpl, which must be a class template with one type
@@ -119,8 +123,7 @@ struct TemplateSel {
};
};
-# define GTEST_BIND_(TmplSel, T) \
- TmplSel::template Bind<T>::type
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
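GTEST_TEMPLATE_ and GTEST_BIND_ exist so that class templates themselves (not instantiations) can be carried around as ordinary types and bound to a concrete T later. The same mechanism without the macros, as a short sketch:

    #include <deque>
    #include <type_traits>
    #include <vector>

    // A selector wraps a single-parameter class template so it can be stored
    // as a plain type and instantiated later via Bind<T>::type.
    template <template <typename> class Tmpl>
    struct TemplateSel {
      template <typename T>
      struct Bind {
        typedef Tmpl<T> type;
      };
    };

    // std::vector/std::deque also take an allocator parameter, so alias them
    // down to one parameter first.
    template <typename T> using Vec = std::vector<T>;
    template <typename T> using Deq = std::deque<T>;

    static_assert(
        std::is_same<TemplateSel<Vec>::Bind<int>::type, std::vector<int>>::value,
        "binds the selected template to int");
    static_assert(
        std::is_same<TemplateSel<Deq>::Bind<char>::type, std::deque<char>>::value,
        "a different selector yields a different container");

    int main() { return 0; }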
template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
struct Templates {
diff --git a/third_party/googletest/src/googletest/src/gtest-all.cc b/third_party/googletest/src/googletest/src/gtest-all.cc
index ad292905c..2a70ed88c 100644
--- a/third_party/googletest/src/googletest/src/gtest-all.cc
+++ b/third_party/googletest/src/googletest/src/gtest-all.cc
@@ -38,7 +38,7 @@
#include "gtest/gtest.h"
// The following lines pull in the real gtest *.cc files.
-#include "src/gtest.cc"
+#include "src/gtest-assertion-result.cc"
#include "src/gtest-death-test.cc"
#include "src/gtest-filepath.cc"
#include "src/gtest-matchers.cc"
@@ -46,3 +46,4 @@
#include "src/gtest-printers.cc"
#include "src/gtest-test-part.cc"
#include "src/gtest-typed-test.cc"
+#include "src/gtest.cc"
diff --git a/third_party/googletest/src/googletest/src/gtest-assertion-result.cc b/third_party/googletest/src/googletest/src/gtest-assertion-result.cc
new file mode 100644
index 000000000..f1c0b10dc
--- /dev/null
+++ b/third_party/googletest/src/googletest/src/gtest-assertion-result.cc
@@ -0,0 +1,77 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file defines the AssertionResult type.
+
+#include "gtest/gtest-assertion-result.h"
+
+#include <string>
+#include <utility>
+
+#include "gtest/gtest-message.h"
+
+namespace testing {
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+ : success_(other.success_),
+ message_(other.message_.get() != nullptr
+ ? new ::std::string(*other.message_)
+ : static_cast< ::std::string*>(nullptr)) {}
+
+// Swaps two AssertionResults.
+void AssertionResult::swap(AssertionResult& other) {
+ using std::swap;
+ swap(success_, other.success_);
+ swap(message_, other.message_);
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+ AssertionResult negation(!success_);
+ if (message_.get() != nullptr) negation << *message_;
+ return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() { return AssertionResult(true); }
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() { return AssertionResult(false); }
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+ return AssertionFailure() << message;
+}
+
+} // namespace testing
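The new gtest-assertion-result.cc defines the pieces users normally meet through predicate functions: AssertionSuccess(), AssertionFailure() with a streamed message, and operator! for the EXPECT_FALSE path. The documented usage pattern looks like this:

    #include "gtest/gtest.h"

    // On failure, the streamed text becomes part of the assertion output.
    ::testing::AssertionResult IsEven(int n) {
      if (n % 2 == 0) return ::testing::AssertionSuccess();
      return ::testing::AssertionFailure() << n << " is odd";
    }

    TEST(AssertionResultDemo, Works) {
      EXPECT_TRUE(IsEven(4));   // passes silently
      EXPECT_FALSE(IsEven(5));  // also passes; the "5 is odd" message stays attached
    }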
diff --git a/third_party/googletest/src/googletest/src/gtest-death-test.cc b/third_party/googletest/src/googletest/src/gtest-death-test.cc
index bf4f6331d..e6abc6278 100644
--- a/third_party/googletest/src/googletest/src/gtest-death-test.cc
+++ b/third_party/googletest/src/googletest/src/gtest-death-test.cc
@@ -35,49 +35,49 @@
#include <functional>
#include <utility>
-#include "gtest/internal/gtest-port.h"
#include "gtest/internal/custom/gtest.h"
+#include "gtest/internal/gtest-port.h"
#if GTEST_HAS_DEATH_TEST
-# if GTEST_OS_MAC
-# include <crt_externs.h>
-# endif // GTEST_OS_MAC
-
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
-
-# if GTEST_OS_LINUX
-# include <signal.h>
-# endif // GTEST_OS_LINUX
-
-# include <stdarg.h>
-
-# if GTEST_OS_WINDOWS
-# include <windows.h>
-# else
-# include <sys/mman.h>
-# include <sys/wait.h>
-# endif // GTEST_OS_WINDOWS
-
-# if GTEST_OS_QNX
-# include <spawn.h>
-# endif // GTEST_OS_QNX
-
-# if GTEST_OS_FUCHSIA
-# include <lib/fdio/fd.h>
-# include <lib/fdio/io.h>
-# include <lib/fdio/spawn.h>
-# include <lib/zx/channel.h>
-# include <lib/zx/port.h>
-# include <lib/zx/process.h>
-# include <lib/zx/socket.h>
-# include <zircon/processargs.h>
-# include <zircon/syscalls.h>
-# include <zircon/syscalls/policy.h>
-# include <zircon/syscalls/port.h>
-# endif // GTEST_OS_FUCHSIA
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif // GTEST_OS_FUCHSIA
#endif // GTEST_HAS_DEATH_TEST
@@ -96,9 +96,12 @@ namespace testing {
// used internally at Google, is "threadsafe".
static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+} // namespace testing
+
GTEST_DEFINE_string_(
death_test_style,
- internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+ testing::internal::StringFromGTestEnv("death_test_style",
+ testing::kDefaultDeathTestStyle),
"Indicates how to run a death test in a forked child process: "
"\"threadsafe\" (child process re-executes the test binary "
"from the beginning, running only the specific death test) or "
@@ -107,7 +110,7 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_bool_(
death_test_use_fork,
- internal::BoolFromGTestEnv("death_test_use_fork", false),
+ testing::internal::BoolFromGTestEnv("death_test_use_fork", false),
"Instructs to use fork()/_exit() instead of clone() in death tests. "
"Ignored and always uses fork() on POSIX systems where clone() is not "
"implemented. Useful when running under valgrind or similar tools if "
@@ -117,7 +120,6 @@ GTEST_DEFINE_bool_(
"work in 99% of the cases. Once valgrind is fixed, this flag will "
"most likely be removed.");
-namespace internal {
GTEST_DEFINE_string_(
internal_run_death_test, "",
"Indicates the file, line number, temporal index of "
@@ -126,7 +128,8 @@ GTEST_DEFINE_string_(
"the '|' characters. This flag is specified if and only if the "
"current process is a sub-process launched for running a thread-safe "
"death test. FOR INTERNAL USE ONLY.");
-} // namespace internal
+
+namespace testing {
#if GTEST_HAS_DEATH_TEST
@@ -134,9 +137,9 @@ namespace internal {
// Valid only for fast death tests. Indicates the code is running in the
// child process of a fast style death test.
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
static bool g_in_fast_death_test_child = false;
-# endif
+#endif
// Returns a Boolean value indicating whether the caller is currently
// executing in the context of the death test child process. Tools such as
@@ -144,16 +147,16 @@ static bool g_in_fast_death_test_child = false;
// tests. IMPORTANT: This is an internal utility. Using it may break the
// implementation of death tests. User code MUST NOT use it.
bool InDeathTestChild() {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
// On Windows and Fuchsia, death tests are thread-safe regardless of the value
// of the death_test_style flag.
- return !GTEST_FLAG(internal_run_death_test).empty();
+ return !GTEST_FLAG_GET(internal_run_death_test).empty();
-# else
+#else
- if (GTEST_FLAG(death_test_style) == "threadsafe")
- return !GTEST_FLAG(internal_run_death_test).empty();
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe")
+ return !GTEST_FLAG_GET(internal_run_death_test).empty();
else
return g_in_fast_death_test_child;
#endif
@@ -162,40 +165,38 @@ bool InDeathTestChild() {
} // namespace internal
// ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
// ExitedWithCode function-call operator.
bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
return exit_status == exit_code_;
-# else
+#else
return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
-# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
}
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
// KilledBySignal function-call operator.
bool KilledBySignal::operator()(int exit_status) const {
-# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
{
bool result;
if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
return result;
}
}
-# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
}
-# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
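ExitedWithCode and KilledBySignal above are the predicates passed to EXPECT_EXIT/ASSERT_EXIT; the #if makes KilledBySignal POSIX-only. A minimal death-test sketch using both, guarded the same way:

    #include <csignal>
    #include <cstdlib>

    #include "gtest/gtest.h"

    TEST(ExitPredicateDemo, Works) {
      // The statement runs in a child process; the predicate checks how it ended.
      EXPECT_EXIT(std::exit(0), ::testing::ExitedWithCode(0), "");
    #if !defined(_WIN32)
      EXPECT_EXIT(std::raise(SIGSEGV), ::testing::KilledBySignal(SIGSEGV), "");
    #endif
    }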
namespace internal {
@@ -206,23 +207,23 @@ namespace internal {
static std::string ExitSummary(int exit_code) {
Message m;
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
m << "Exited with exit status " << exit_code;
-# else
+#else
if (WIFEXITED(exit_code)) {
m << "Exited with exit status " << WEXITSTATUS(exit_code);
} else if (WIFSIGNALED(exit_code)) {
m << "Terminated by signal " << WTERMSIG(exit_code);
}
-# ifdef WCOREDUMP
+#ifdef WCOREDUMP
if (WCOREDUMP(exit_code)) {
m << " (core dumped)";
}
-# endif
-# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif
+#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
return m.GetString();
}
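ExitSummary above is the standard decoding of a wait()-style status: WIFEXITED/WEXITSTATUS for a normal exit, WIFSIGNALED/WTERMSIG for a signal, plus the optional WCOREDUMP note. The same decoding outside gtest, as a POSIX-only sketch:

    #include <sys/wait.h>
    #include <unistd.h>

    #include <cstdio>

    int main() {
      pid_t pid = fork();
      if (pid == 0) _exit(3);  // child: normal termination, status 3

      int status = 0;
      waitpid(pid, &status, 0);
      if (WIFEXITED(status)) {
        std::printf("exited with status %d\n", WEXITSTATUS(status));  // prints 3
      } else if (WIFSIGNALED(status)) {
        std::printf("terminated by signal %d\n", WTERMSIG(status));
      }
      return 0;
    }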
@@ -233,7 +234,7 @@ bool ExitedUnsuccessfully(int exit_status) {
return !ExitedWithCode(0)(exit_status);
}
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Generates a textual failure message when a death test finds more than
// one thread running, or cannot determine the number of threads, prior
// to executing the given statement. It is the responsibility of the
@@ -254,7 +255,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) {
<< " this is the last message you see before your test times out.";
return msg.GetString();
}
-# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Flag characters for reporting a death test that did not die.
static const char kDeathTestLived = 'L';
@@ -304,14 +305,14 @@ static void DeathTestAbort(const std::string& message) {
// A replacement for CHECK that calls DeathTestAbort if the assertion
// fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
- do { \
- if (!::testing::internal::IsTrue(expression)) { \
- DeathTestAbort( \
- ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
- + ::testing::internal::StreamableToString(__LINE__) + ": " \
- + #expression); \
- } \
+#define GTEST_DEATH_TEST_CHECK_(expression) \
+ do { \
+ if (!::testing::internal::IsTrue(expression)) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression); \
+ } \
} while (::testing::internal::AlwaysFalse())
// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
@@ -321,23 +322,23 @@ static void DeathTestAbort(const std::string& message) {
// evaluates the expression as long as it evaluates to -1 and sets
// errno to EINTR. If the expression evaluates to -1 but errno is
// something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
- do { \
- int gtest_retval; \
- do { \
- gtest_retval = (expression); \
- } while (gtest_retval == -1 && errno == EINTR); \
- if (gtest_retval == -1) { \
- DeathTestAbort( \
- ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
- + ::testing::internal::StreamableToString(__LINE__) + ": " \
- + #expression + " != -1"); \
- } \
+#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+ do { \
+ int gtest_retval; \
+ do { \
+ gtest_retval = (expression); \
+ } while (gtest_retval == -1 && errno == EINTR); \
+ if (gtest_retval == -1) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression + " != -1"); \
+ } \
} while (::testing::internal::AlwaysFalse())
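GTEST_DEATH_TEST_CHECK_SYSCALL_ differs from GTEST_DEATH_TEST_CHECK_ only in the retry loop: a call that fails with EINTR is repeated rather than reported. The idiom on its own, with read() as a representative interruptible call (a sketch, not gtest code):

    #include <unistd.h>

    #include <cerrno>

    // Retry an interruptible syscall until it succeeds or fails with something
    // other than EINTR, mirroring the macro's inner do/while loop.
    ssize_t ReadRetryingOnEintr(int fd, void* buf, size_t count) {
      ssize_t result;
      do {
        result = read(fd, buf, count);
      } while (result == -1 && errno == EINTR);
      return result;
    }

    int main() {
      char byte;
      (void)ReadRetryingOnEintr(0, &byte, 1);  // an EINTR result never escapes the loop
      return 0;
    }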
// Returns the message describing the last system error in errno.
std::string GetLastErrnoDescription() {
- return errno == 0 ? "" : posix::StrError(errno);
+ return errno == 0 ? "" : posix::StrError(errno);
}
// This is called from a death test parent process to read a failure
@@ -370,8 +371,9 @@ static void FailFromInternalError(int fd) {
DeathTest::DeathTest() {
TestInfo* const info = GetUnitTestImpl()->current_test_info();
if (info == nullptr) {
- DeathTestAbort("Cannot run a death test outside of a TEST or "
- "TEST_F construct");
+ DeathTestAbort(
+ "Cannot run a death test outside of a TEST or "
+ "TEST_F construct");
}
}
@@ -500,9 +502,7 @@ void DeathTestImpl::ReadAndInterpretStatusByte() {
set_read_fd(-1);
}
-std::string DeathTestImpl::GetErrorLogs() {
- return GetCapturedStderr();
-}
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
// Signals that the death test code which should have exited, didn't.
// Should be called only in a death test child process.
@@ -512,9 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) {
// The parent process considers the death test to be a failure if
// it finds any data in our pipe. So, here we write a single flag byte
// to the pipe, then exit.
- const char status_ch =
- reason == TEST_DID_NOT_DIE ? kDeathTestLived :
- reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+ const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived
+ : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew
+ : kDeathTestReturned;
GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
// We are leaking the descriptor here because on some platforms (i.e.,
@@ -533,7 +533,7 @@ void DeathTestImpl::Abort(AbortReason reason) {
// much easier.
static ::std::string FormatDeathTestOutput(const ::std::string& output) {
::std::string ret;
- for (size_t at = 0; ; ) {
+ for (size_t at = 0;;) {
const size_t line_end = output.find('\n', at);
ret += "[ DEATH ] ";
if (line_end == ::std::string::npos) {
@@ -568,8 +568,7 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) {
// the first failing condition, in the order given above, is the one that is
// reported. Also sets the last death test message string.
bool DeathTestImpl::Passed(bool status_ok) {
- if (!spawned())
- return false;
+ if (!spawned()) return false;
const std::string error_message = GetErrorLogs();
@@ -580,15 +579,18 @@ bool DeathTestImpl::Passed(bool status_ok) {
switch (outcome()) {
case LIVED:
buffer << " Result: failed to die.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case THREW:
buffer << " Result: threw an exception.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case RETURNED:
buffer << " Result: illegal return in test statement.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case DIED:
if (status_ok) {
@@ -605,7 +607,8 @@ bool DeathTestImpl::Passed(bool status_ok) {
} else {
buffer << " Result: died but not with expected exit code:\n"
<< " " << ExitSummary(status()) << "\n"
- << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+ << "Actual msg:\n"
+ << FormatDeathTestOutput(error_message);
}
break;
case IN_PROGRESS:
@@ -618,7 +621,7 @@ bool DeathTestImpl::Passed(bool status_ok) {
return success;
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
// WindowsDeathTest implements death tests on Windows. Due to the
// specifics of starting new processes on Windows, death tests there are
// always threadsafe, and Google Test considers the
@@ -679,14 +682,12 @@ class WindowsDeathTest : public DeathTestImpl {
// status, or 0 if no child process exists. As a side effect, sets the
// outcome data member.
int WindowsDeathTest::Wait() {
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
// Wait until the child either signals that it has acquired the write end
// of the pipe or it dies.
- const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
- switch (::WaitForMultipleObjects(2,
- wait_handles,
+ const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()};
+ switch (::WaitForMultipleObjects(2, wait_handles,
FALSE, // Waits for any of the handles.
INFINITE)) {
case WAIT_OBJECT_0:
@@ -707,9 +708,8 @@ int WindowsDeathTest::Wait() {
// returns immediately if the child has already exited, regardless of
// whether previous calls to WaitForMultipleObjects synchronized on this
// handle or not.
- GTEST_DEATH_TEST_CHECK_(
- WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
- INFINITE));
+ GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+ ::WaitForSingleObject(child_handle_.Get(), INFINITE));
DWORD status_code;
GTEST_DEATH_TEST_CHECK_(
::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
@@ -742,12 +742,12 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
nullptr, TRUE};
HANDLE read_handle, write_handle;
- GTEST_DEATH_TEST_CHECK_(
- ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
- 0) // Default buffer size.
- != FALSE);
- set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
- O_RDONLY));
+ GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+ &handles_are_inheritable,
+ 0) // Default buffer size.
+ != FALSE);
+ set_read_fd(
+ ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
write_handle_.Reset(write_handle);
event_handle_.Reset(::CreateEvent(
&handles_are_inheritable,
@@ -756,27 +756,26 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
nullptr)); // The even is unnamed.
GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
- "=" + file_ + "|" + StreamableToString(line_) + "|" +
- StreamableToString(death_test_index) + "|" +
+ std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) +
+ "|" + StreamableToString(death_test_index) + "|" +
StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
// size_t has the same width as pointers on both 32-bit and 64-bit
// Windows platforms.
// See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
- "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
- "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+ "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+ StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
char executable_path[_MAX_PATH + 1]; // NOLINT
GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
executable_path,
_MAX_PATH));
- std::string command_line =
- std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
- internal_flag + "\"";
+ std::string command_line = std::string(::GetCommandLineA()) + " " +
+ filter_flag + " \"" + internal_flag + "\"";
DeathTest::set_last_death_test_message("");
@@ -796,8 +795,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
GTEST_DEATH_TEST_CHECK_(
::CreateProcessA(
executable_path, const_cast<char*>(command_line.c_str()),
- nullptr, // Retuned process handle is not inheritable.
- nullptr, // Retuned thread handle is not inheritable.
+ nullptr, // Returned process handle is not inheritable.
+ nullptr, // Returned thread handle is not inheritable.
TRUE, // Child inherits all inheritable handles (for write_handle_).
0x0, // Default creation flags.
nullptr, // Inherit the parent's environment.
@@ -809,7 +808,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
return OVERSEE_TEST;
}
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
class FuchsiaDeathTest : public DeathTestImpl {
public:
@@ -855,18 +854,13 @@ class Arguments {
template <typename Str>
void AddArguments(const ::std::vector<Str>& arguments) {
for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
- i != arguments.end();
- ++i) {
+ i != arguments.end(); ++i) {
args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
}
}
- char* const* Argv() {
- return &args_[0];
- }
+ char* const* Argv() { return &args_[0]; }
- int size() {
- return static_cast<int>(args_.size()) - 1;
- }
+ int size() { return static_cast<int>(args_.size()) - 1; }
private:
std::vector<char*> args_;
@@ -880,8 +874,7 @@ int FuchsiaDeathTest::Wait() {
const int kSocketKey = 1;
const int kExceptionKey = 2;
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
// Create a port to wait for socket/task/exception events.
zx_status_t status_zx;
@@ -890,8 +883,8 @@ int FuchsiaDeathTest::Wait() {
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for the child process to terminate.
- status_zx = child_process_.wait_async(
- port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
+ status_zx =
+ child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for the socket to be readable or closed.
@@ -900,8 +893,8 @@ int FuchsiaDeathTest::Wait() {
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for an exception.
- status_zx = exception_channel_.wait_async(
- port, kExceptionKey, ZX_CHANNEL_READABLE, 0);
+ status_zx = exception_channel_.wait_async(port, kExceptionKey,
+ ZX_CHANNEL_READABLE, 0);
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
bool process_terminated = false;
@@ -931,9 +924,9 @@ int FuchsiaDeathTest::Wait() {
size_t old_length = captured_stderr_.length();
size_t bytes_read = 0;
captured_stderr_.resize(old_length + kBufferSize);
- status_zx = stderr_socket_.read(
- 0, &captured_stderr_.front() + old_length, kBufferSize,
- &bytes_read);
+ status_zx =
+ stderr_socket_.read(0, &captured_stderr_.front() + old_length,
+ kBufferSize, &bytes_read);
captured_stderr_.resize(old_length + bytes_read);
} while (status_zx == ZX_OK);
if (status_zx == ZX_ERR_PEER_CLOSED) {
@@ -987,13 +980,12 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Build the child process command line.
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
- const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
- + file_ + "|"
- + StreamableToString(line_) + "|"
- + StreamableToString(death_test_index);
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ kInternalRunDeathTestFlag + "=" + file_ +
+ "|" + StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index);
Arguments args;
args.AddArguments(GetInjectableArgvs());
args.AddArgument(filter_flag.c_str());
@@ -1016,8 +1008,7 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Create a socket pair that will be used to receive the child process' stderr.
zx::socket stderr_producer_socket;
- status =
- zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+ status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
GTEST_DEATH_TEST_CHECK_(status >= 0);
int stderr_producer_fd = -1;
status =
@@ -1034,35 +1025,32 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Create a child job.
zx_handle_t child_job = ZX_HANDLE_INVALID;
- status = zx_job_create(zx_job_default(), 0, & child_job);
+ status = zx_job_create(zx_job_default(), 0, &child_job);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
zx_policy_basic_t policy;
policy.condition = ZX_POL_NEW_ANY;
policy.policy = ZX_POL_ACTION_ALLOW;
- status = zx_job_set_policy(
- child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1);
+ status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+ &policy, 1);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
// Create an exception channel attached to the |child_job|, to allow
// us to suppress the system default exception handler from firing.
- status =
- zx_task_create_exception_channel(
- child_job, 0, exception_channel_.reset_and_get_address());
+ status = zx_task_create_exception_channel(
+ child_job, 0, exception_channel_.reset_and_get_address());
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
// Spawn the child process.
- status = fdio_spawn_etc(
- child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr,
- 2, spawn_actions, child_process_.reset_and_get_address(), nullptr);
+ status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+ args.Argv(), nullptr, 2, spawn_actions,
+ child_process_.reset_and_get_address(), nullptr);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
set_spawned(true);
return OVERSEE_TEST;
}
-std::string FuchsiaDeathTest::GetErrorLogs() {
- return captured_stderr_;
-}
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
#else // We are neither on Windows, nor on Fuchsia.
@@ -1093,8 +1081,7 @@ ForkingDeathTest::ForkingDeathTest(const char* a_statement,
// status, or 0 if no child process exists. As a side effect, sets the
// outcome data member.
int ForkingDeathTest::Wait() {
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
ReadAndInterpretStatusByte();
@@ -1173,11 +1160,11 @@ class ExecDeathTest : public ForkingDeathTest {
private:
static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
::std::vector<std::string> args = GetInjectableArgvs();
-# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
::std::vector<std::string> extra_args =
GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
args.insert(args.end(), extra_args.begin(), extra_args.end());
-# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
return args;
}
// The name of the file in which the death test is located.
@@ -1204,14 +1191,11 @@ class Arguments {
template <typename Str>
void AddArguments(const ::std::vector<Str>& arguments) {
for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
- i != arguments.end();
- ++i) {
+ i != arguments.end(); ++i) {
args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
}
}
- char* const* Argv() {
- return &args_[0];
- }
+ char* const* Argv() { return &args_[0]; }
private:
std::vector<char*> args_;
@@ -1224,9 +1208,9 @@ struct ExecDeathTestArgs {
int close_fd; // File descriptor to close; the read end of a pipe
};
-# if GTEST_OS_QNX
+#if GTEST_OS_QNX
extern "C" char** environ;
-# else // GTEST_OS_QNX
+#else // GTEST_OS_QNX
// The main function for a threadsafe-style death test child process.
// This function is called in a clone()-ed process and thus must avoid
// any potentially unsafe operations like malloc or libc functions.
@@ -1241,8 +1225,8 @@ static int ExecDeathTestChildMain(void* child_arg) {
UnitTest::GetInstance()->original_working_dir();
// We can safely call chdir() as it's a direct system call.
if (chdir(original_dir) != 0) {
- DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
- GetLastErrnoDescription());
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
@@ -1253,13 +1237,12 @@ static int ExecDeathTestChildMain(void* child_arg) {
// one path separator.
execv(args->argv[0], args->argv);
DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
- original_dir + " failed: " +
- GetLastErrnoDescription());
+ original_dir + " failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
-# endif // GTEST_OS_QNX
+#endif // GTEST_OS_QNX
-# if GTEST_HAS_CLONE
+#if GTEST_HAS_CLONE
// Two utility routines that together determine the direction the stack
// grows.
// This could be accomplished more elegantly by a single recursive
@@ -1293,7 +1276,7 @@ static bool StackGrowsDown() {
StackLowerThanAddress(&dummy, &result);
return result;
}
-# endif // GTEST_HAS_CLONE
+#endif // GTEST_HAS_CLONE
// Spawns a child process with the same executable as the current process in
// a thread-safe manner and instructs it to run the death test. The
@@ -1303,10 +1286,10 @@ static bool StackGrowsDown() {
// spawn(2) there instead. The function dies with an error message if
// anything goes wrong.
static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
- ExecDeathTestArgs args = { argv, close_fd };
+ ExecDeathTestArgs args = {argv, close_fd};
pid_t child_pid = -1;
-# if GTEST_OS_QNX
+#if GTEST_OS_QNX
// Obtains the current directory and sets it to be closed in the child
// process.
const int cwd_fd = open(".", O_RDONLY);
@@ -1319,16 +1302,16 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
UnitTest::GetInstance()->original_working_dir();
// We can safely call chdir() as it's a direct system call.
if (chdir(original_dir) != 0) {
- DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
- GetLastErrnoDescription());
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
int fd_flags;
// Set close_fd to be closed after spawn.
GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
- GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
- fd_flags | FD_CLOEXEC));
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
struct inheritance inherit = {0};
// spawn is a system call.
child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
@@ -1336,8 +1319,8 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
-# else // GTEST_OS_QNX
-# if GTEST_OS_LINUX
+#else // GTEST_OS_QNX
+#if GTEST_OS_LINUX
// When a SIGPROF signal is received while fork() or clone() are executing,
// the process may hang. To avoid this, we ignore SIGPROF here and re-enable
// it after the call to fork()/clone() is complete.
@@ -1346,12 +1329,12 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
sigemptyset(&ignore_sigprof_action.sa_mask);
ignore_sigprof_action.sa_handler = SIG_IGN;
- GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
- SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-# endif // GTEST_OS_LINUX
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif // GTEST_OS_LINUX
-# if GTEST_HAS_CLONE
- const bool use_fork = GTEST_FLAG(death_test_use_fork);
+#if GTEST_HAS_CLONE
+ const bool use_fork = GTEST_FLAG_GET(death_test_use_fork);
if (!use_fork) {
static const bool stack_grows_down = StackGrowsDown();
@@ -1370,7 +1353,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
const size_t kMaxStackAlignment = 64;
void* const stack_top =
static_cast<char*>(stack) +
- (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+ (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
GTEST_DEATH_TEST_CHECK_(
static_cast<size_t>(stack_size) > kMaxStackAlignment &&
reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
@@ -1379,19 +1362,19 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
}
-# else
+#else
const bool use_fork = true;
-# endif // GTEST_HAS_CLONE
+#endif // GTEST_HAS_CLONE
if (use_fork && (child_pid = fork()) == 0) {
- ExecDeathTestChildMain(&args);
- _exit(0);
+ ExecDeathTestChildMain(&args);
+ _exit(0);
}
-# endif // GTEST_OS_QNX
-# if GTEST_OS_LINUX
+#endif // GTEST_OS_QNX
+#if GTEST_OS_LINUX
GTEST_DEATH_TEST_CHECK_SYSCALL_(
sigaction(SIGPROF, &saved_sigprof_action, nullptr));
-# endif // GTEST_OS_LINUX
+#endif // GTEST_OS_LINUX
GTEST_DEATH_TEST_CHECK_(child_pid != -1);
return child_pid;
@@ -1420,13 +1403,13 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
- const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
- + file_ + "|" + StreamableToString(line_) + "|"
- + StreamableToString(death_test_index) + "|"
- + StreamableToString(pipe_fd[1]);
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" +
+ StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index) + "|" +
+ StreamableToString(pipe_fd[1]);
Arguments args;
args.AddArguments(GetArgvsForDeathTestChildProcess());
args.AddArgument(filter_flag.c_str());
@@ -1447,7 +1430,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
return OVERSEE_TEST;
}
-# endif // !GTEST_OS_WINDOWS
+#endif // !GTEST_OS_WINDOWS
// Creates a concrete DeathTest-derived class that depends on the
// --gtest_death_test_style flag, and sets the pointer pointed to
@@ -1461,15 +1444,15 @@ bool DefaultDeathTestFactory::Create(const char* statement,
UnitTestImpl* const impl = GetUnitTestImpl();
const InternalRunDeathTestFlag* const flag =
impl->internal_run_death_test_flag();
- const int death_test_index = impl->current_test_info()
- ->increment_death_test_count();
+ const int death_test_index =
+ impl->current_test_info()->increment_death_test_count();
if (flag != nullptr) {
if (death_test_index > flag->index()) {
DeathTest::set_last_death_test_message(
- "Death test count (" + StreamableToString(death_test_index)
- + ") somehow exceeded expected maximum ("
- + StreamableToString(flag->index()) + ")");
+ "Death test count (" + StreamableToString(death_test_index) +
+ ") somehow exceeded expected maximum (" +
+ StreamableToString(flag->index()) + ")");
return false;
}
@@ -1480,50 +1463,50 @@ bool DefaultDeathTestFactory::Create(const char* statement,
}
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
- if (GTEST_FLAG(death_test_style) == "threadsafe" ||
- GTEST_FLAG(death_test_style) == "fast") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new WindowsDeathTest(statement, std::move(matcher), file, line);
}
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
- if (GTEST_FLAG(death_test_style) == "threadsafe" ||
- GTEST_FLAG(death_test_style) == "fast") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
}
-# else
+#else
- if (GTEST_FLAG(death_test_style) == "threadsafe") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe") {
*test = new ExecDeathTest(statement, std::move(matcher), file, line);
- } else if (GTEST_FLAG(death_test_style) == "fast") {
+ } else if (GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new NoExecDeathTest(statement, std::move(matcher));
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
else { // NOLINT - this is more readable than unbalanced brackets inside #if.
- DeathTest::set_last_death_test_message(
- "Unknown death test style \"" + GTEST_FLAG(death_test_style)
- + "\" encountered");
+ DeathTest::set_last_death_test_message("Unknown death test style \"" +
+ GTEST_FLAG_GET(death_test_style) +
+ "\" encountered");
return false;
}
return true;
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
// Recreates the pipe and event handles from the provided parameters,
// signals the event, and returns a file descriptor wrapped around the pipe
// handle. This function is called in the child process only.
static int GetStatusFileDescriptor(unsigned int parent_process_id,
- size_t write_handle_as_size_t,
- size_t event_handle_as_size_t) {
+ size_t write_handle_as_size_t,
+ size_t event_handle_as_size_t) {
AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
- FALSE, // Non-inheritable.
- parent_process_id));
+ FALSE, // Non-inheritable.
+ parent_process_id));
if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
DeathTestAbort("Unable to open parent process " +
StreamableToString(parent_process_id));
@@ -1531,8 +1514,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
- const HANDLE write_handle =
- reinterpret_cast<HANDLE>(write_handle_as_size_t);
+ const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
HANDLE dup_write_handle;
// The newly initialized handle is accessible only in the parent
@@ -1554,9 +1536,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
HANDLE dup_event_handle;
if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
- ::GetCurrentProcess(), &dup_event_handle,
- 0x0,
- FALSE,
+ ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
DUPLICATE_SAME_ACCESS)) {
DeathTestAbort("Unable to duplicate the event handle " +
StreamableToString(event_handle_as_size_t) +
@@ -1578,61 +1558,57 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
return write_fd;
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
// Returns a newly created InternalRunDeathTestFlag object with fields
// initialized from the GTEST_FLAG(internal_run_death_test) flag if
// the flag is specified; otherwise returns NULL.
InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
- if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
+ if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr;
// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
// can use it here.
int line = -1;
int index = -1;
::std::vector< ::std::string> fields;
- SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+ SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields);
int write_fd = -1;
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
unsigned int parent_process_id = 0;
size_t write_handle_as_size_t = 0;
size_t event_handle_as_size_t = 0;
- if (fields.size() != 6
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)
- || !ParseNaturalNumber(fields[3], &parent_process_id)
- || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
- || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+ if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &parent_process_id) ||
+ !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+ !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
- GTEST_FLAG(internal_run_death_test));
+ GTEST_FLAG_GET(internal_run_death_test));
}
- write_fd = GetStatusFileDescriptor(parent_process_id,
- write_handle_as_size_t,
+ write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
event_handle_as_size_t);
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
- if (fields.size() != 3
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)) {
- DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
- + GTEST_FLAG(internal_run_death_test));
+ if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
}
-# else
+#else
- if (fields.size() != 4
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)
- || !ParseNaturalNumber(fields[3], &write_fd)) {
- DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
- + GTEST_FLAG(internal_run_death_test));
+ if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &write_fd)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
}
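
The death-test hunks above move flag access to the GTEST_FLAG_GET accessor and keep the style dispatch (ExecDeathTest for "threadsafe", NoExecDeathTest for "fast" on POSIX). A minimal sketch of the public side of that switch, assuming the GTEST_FLAG_SET macro from the same googletest release; the test name and death statement are illustrative only and are not part of this commit:

#include <cstdlib>
#include "gtest/gtest.h"

// Hypothetical death test: the statement runs in a child process created by
// the ExecDeathTest/NoExecDeathTest classes shown in the diff above.
TEST(MyDeathTest, AbortsOnBadInput) {
  EXPECT_DEATH(std::abort(), "");
}

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  // GTEST_FLAG_SET replaces the direct GTEST_FLAG(death_test_style) assignment
  // removed above; "threadsafe" re-executes the binary, "fast" forks without exec.
  GTEST_FLAG_SET(death_test_style, "threadsafe");
  return RUN_ALL_TESTS();
}
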
diff --git a/third_party/googletest/src/googletest/src/gtest-filepath.cc b/third_party/googletest/src/googletest/src/gtest-filepath.cc
index 0b5629401..f6ee90cdb 100644
--- a/third_party/googletest/src/googletest/src/gtest-filepath.cc
+++ b/third_party/googletest/src/googletest/src/gtest-filepath.cc
@@ -30,29 +30,31 @@
#include "gtest/internal/gtest-filepath.h"
#include <stdlib.h>
-#include "gtest/internal/gtest-port.h"
+
#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
#if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>
+#include <windows.h>
#elif GTEST_OS_WINDOWS
-# include <direct.h>
-# include <io.h>
+#include <direct.h>
+#include <io.h>
#else
-# include <limits.h>
-# include <climits> // Some Linux distributions define PATH_MAX here.
-#endif // GTEST_OS_WINDOWS_MOBILE
+#include <limits.h>
+
+#include <climits> // Some Linux distributions define PATH_MAX here.
+#endif // GTEST_OS_WINDOWS_MOBILE
#include "gtest/internal/gtest-string.h"
#if GTEST_OS_WINDOWS
-# define GTEST_PATH_MAX_ _MAX_PATH
+#define GTEST_PATH_MAX_ _MAX_PATH
#elif defined(PATH_MAX)
-# define GTEST_PATH_MAX_ PATH_MAX
+#define GTEST_PATH_MAX_ PATH_MAX
#elif defined(_XOPEN_PATH_MAX)
-# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
#else
-# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
#endif // GTEST_OS_WINDOWS
namespace testing {
@@ -66,16 +68,16 @@ namespace internal {
const char kPathSeparator = '\\';
const char kAlternatePathSeparator = '/';
const char kAlternatePathSeparatorString[] = "/";
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
// Windows CE doesn't have a current directory. You should not use
// the current directory in tests on Windows CE, but this at least
// provides a reasonable fallback.
const char kCurrentDirectoryString[] = "\\";
// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
const DWORD kInvalidFileAttributes = 0xffffffff;
-# else
+#else
const char kCurrentDirectoryString[] = ".\\";
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
#else
const char kPathSeparator = '/';
const char kCurrentDirectoryString[] = "./";
@@ -99,17 +101,17 @@ FilePath FilePath::GetCurrentDir() {
// something reasonable.
return FilePath(kCurrentDirectoryString);
#elif GTEST_OS_WINDOWS
- char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
#else
- char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
char* result = getcwd(cwd, sizeof(cwd));
-# if GTEST_OS_NACL
+#if GTEST_OS_NACL
// getcwd will likely fail in NaCl due to the sandbox, so return something
// reasonable. The user may have provided a shim implementation for getcwd,
// however, so fall back only when failure is detected.
return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
-# endif // GTEST_OS_NACL
+#endif // GTEST_OS_NACL
return FilePath(result == nullptr ? "" : cwd);
#endif // GTEST_OS_WINDOWS_MOBILE
}
@@ -121,8 +123,8 @@ FilePath FilePath::GetCurrentDir() {
FilePath FilePath::RemoveExtension(const char* extension) const {
const std::string dot_extension = std::string(".") + extension;
if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
- return FilePath(pathname_.substr(
- 0, pathname_.length() - dot_extension.length()));
+ return FilePath(
+ pathname_.substr(0, pathname_.length() - dot_extension.length()));
}
return *this;
}
@@ -178,15 +180,14 @@ FilePath FilePath::RemoveFileName() const {
// than zero (e.g., 12), returns "dir/test_12.xml".
// On Windows platform, uses \ as the separator rather than /.
FilePath FilePath::MakeFileName(const FilePath& directory,
- const FilePath& base_name,
- int number,
+ const FilePath& base_name, int number,
const char* extension) {
std::string file;
if (number == 0) {
file = base_name.string() + "." + extension;
} else {
- file = base_name.string() + "_" + StreamableToString(number)
- + "." + extension;
+ file =
+ base_name.string() + "_" + StreamableToString(number) + "." + extension;
}
return ConcatPaths(directory, FilePath(file));
}
@@ -195,8 +196,7 @@ FilePath FilePath::MakeFileName(const FilePath& directory,
// On Windows, uses \ as the separator rather than /.
FilePath FilePath::ConcatPaths(const FilePath& directory,
const FilePath& relative_path) {
- if (directory.IsEmpty())
- return relative_path;
+ if (directory.IsEmpty()) return relative_path;
const FilePath dir(directory.RemoveTrailingPathSeparator());
return FilePath(dir.string() + kPathSeparator + relative_path.string());
}
@@ -207,7 +207,7 @@ bool FilePath::FileOrDirectoryExists() const {
#if GTEST_OS_WINDOWS_MOBILE
LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
const DWORD attributes = GetFileAttributes(unicode);
- delete [] unicode;
+ delete[] unicode;
return attributes != kInvalidFileAttributes;
#else
posix::StatStruct file_stat{};
@@ -222,8 +222,8 @@ bool FilePath::DirectoryExists() const {
#if GTEST_OS_WINDOWS
// Don't strip off trailing separator if path is a root directory on
// Windows (like "C:\\").
- const FilePath& path(IsRootDirectory() ? *this :
- RemoveTrailingPathSeparator());
+ const FilePath& path(IsRootDirectory() ? *this
+ : RemoveTrailingPathSeparator());
#else
const FilePath& path(*this);
#endif
@@ -231,15 +231,15 @@ bool FilePath::DirectoryExists() const {
#if GTEST_OS_WINDOWS_MOBILE
LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
const DWORD attributes = GetFileAttributes(unicode);
- delete [] unicode;
+ delete[] unicode;
if ((attributes != kInvalidFileAttributes) &&
(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
result = true;
}
#else
posix::StatStruct file_stat{};
- result = posix::Stat(path.c_str(), &file_stat) == 0 &&
- posix::IsDir(file_stat);
+ result =
+ posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
#endif // GTEST_OS_WINDOWS_MOBILE
return result;
@@ -260,10 +260,9 @@ bool FilePath::IsAbsolutePath() const {
const char* const name = pathname_.c_str();
#if GTEST_OS_WINDOWS
return pathname_.length() >= 3 &&
- ((name[0] >= 'a' && name[0] <= 'z') ||
- (name[0] >= 'A' && name[0] <= 'Z')) &&
- name[1] == ':' &&
- IsPathSeparator(name[2]);
+ ((name[0] >= 'a' && name[0] <= 'z') ||
+ (name[0] >= 'A' && name[0] <= 'Z')) &&
+ name[1] == ':' && IsPathSeparator(name[2]);
#else
return IsPathSeparator(name[0]);
#endif
@@ -321,7 +320,7 @@ bool FilePath::CreateFolder() const {
FilePath removed_sep(this->RemoveTrailingPathSeparator());
LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
- delete [] unicode;
+ delete[] unicode;
#elif GTEST_OS_WINDOWS
int result = _mkdir(pathname_.c_str());
#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
@@ -341,9 +340,8 @@ bool FilePath::CreateFolder() const {
// name, otherwise return the name string unmodified.
// On Windows platform, uses \ as the separator, other platforms use /.
FilePath FilePath::RemoveTrailingPathSeparator() const {
- return IsDirectory()
- ? FilePath(pathname_.substr(0, pathname_.length() - 1))
- : *this;
+ return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+ : *this;
}
// Removes any redundant separators that might be in the pathname.
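
For the reflowed FilePath::MakeFileName declaration above, the surrounding comment already spells out the behavior; a small sketch under the same assumptions (internal header path, POSIX separators) just to make the two cases concrete:

#include <cstdio>
#include "gtest/internal/gtest-filepath.h"

int main() {
  using testing::internal::FilePath;
  // number == 0 omits the "_<number>" suffix; any other value inserts it
  // before the extension, e.g. "dir/test_12.xml".
  FilePath plain = FilePath::MakeFileName(FilePath("dir"), FilePath("test"), 0, "xml");
  FilePath numbered = FilePath::MakeFileName(FilePath("dir"), FilePath("test"), 12, "xml");
  std::printf("%s\n%s\n", plain.c_str(), numbered.c_str());
  return 0;
}
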
diff --git a/third_party/googletest/src/googletest/src/gtest-internal-inl.h b/third_party/googletest/src/googletest/src/gtest-internal-inl.h
index 6d8cecbbb..0b9e929c6 100644
--- a/third_party/googletest/src/googletest/src/gtest-internal-inl.h
+++ b/third_party/googletest/src/googletest/src/gtest-internal-inl.h
@@ -35,7 +35,7 @@
#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
#ifndef _WIN32_WCE
-# include <errno.h>
+#include <errno.h>
#endif // !_WIN32_WCE
#include <stddef.h>
#include <stdlib.h> // For strtoll/_strtoul64/malloc/free.
@@ -50,22 +50,20 @@
#include "gtest/internal/gtest-port.h"
#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h> // NOLINT
-# include <netdb.h> // NOLINT
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
#endif
#if GTEST_OS_WINDOWS
-# include <windows.h> // NOLINT
-#endif // GTEST_OS_WINDOWS
+#include <windows.h> // NOLINT
+#endif // GTEST_OS_WINDOWS
-#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"
+#include "gtest/gtest.h"
GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
/* class A needs to have dll-interface to be used by clients of class B */)
-namespace testing {
-
// Declares the flags.
//
// We don't want the users to modify this flag in the code, but want
@@ -73,32 +71,13 @@ namespace testing {
// declare it here as opposed to in gtest.h.
GTEST_DECLARE_bool_(death_test_use_fork);
+namespace testing {
namespace internal {
// The value of GetTestTypeId() as seen from within the Google Test
// library. This is solely for testing GetTestTypeId().
GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
-// Names of the flags (needed for parsing Google Test flags).
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
-const char kBreakOnFailureFlag[] = "break_on_failure";
-const char kCatchExceptionsFlag[] = "catch_exceptions";
-const char kColorFlag[] = "color";
-const char kFailFast[] = "fail_fast";
-const char kFilterFlag[] = "filter";
-const char kListTestsFlag[] = "list_tests";
-const char kOutputFlag[] = "output";
-const char kBriefFlag[] = "brief";
-const char kPrintTimeFlag[] = "print_time";
-const char kPrintUTF8Flag[] = "print_utf8";
-const char kRandomSeedFlag[] = "random_seed";
-const char kRepeatFlag[] = "repeat";
-const char kShuffleFlag[] = "shuffle";
-const char kStackTraceDepthFlag[] = "stack_trace_depth";
-const char kStreamResultToFlag[] = "stream_result_to";
-const char kThrowOnFailureFlag[] = "throw_on_failure";
-const char kFlagfileFlag[] = "flagfile";
-
// A valid random seed must be in [1, kMaxRandomSeed].
const int kMaxRandomSeed = 99999;
@@ -125,21 +104,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
- const char* str, const char* flag, int32_t* value);
+GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value);
// Returns a random seed in range [1, kMaxRandomSeed] based on the
// given --gtest_random_seed flag value.
inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
- const unsigned int raw_seed = (random_seed_flag == 0) ?
- static_cast<unsigned int>(GetTimeInMillis()) :
- static_cast<unsigned int>(random_seed_flag);
+ const unsigned int raw_seed =
+ (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+ : static_cast<unsigned int>(random_seed_flag);
// Normalizes the actual seed to range [1, kMaxRandomSeed] such that
// it's easy to type.
const int normalized_seed =
static_cast<int>((raw_seed - 1U) %
- static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+ static_cast<unsigned int>(kMaxRandomSeed)) +
+ 1;
return normalized_seed;
}
@@ -160,50 +139,54 @@ class GTestFlagSaver {
public:
// The c'tor.
GTestFlagSaver() {
- also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
- break_on_failure_ = GTEST_FLAG(break_on_failure);
- catch_exceptions_ = GTEST_FLAG(catch_exceptions);
- color_ = GTEST_FLAG(color);
- death_test_style_ = GTEST_FLAG(death_test_style);
- death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
- fail_fast_ = GTEST_FLAG(fail_fast);
- filter_ = GTEST_FLAG(filter);
- internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
- list_tests_ = GTEST_FLAG(list_tests);
- output_ = GTEST_FLAG(output);
- brief_ = GTEST_FLAG(brief);
- print_time_ = GTEST_FLAG(print_time);
- print_utf8_ = GTEST_FLAG(print_utf8);
- random_seed_ = GTEST_FLAG(random_seed);
- repeat_ = GTEST_FLAG(repeat);
- shuffle_ = GTEST_FLAG(shuffle);
- stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
- stream_result_to_ = GTEST_FLAG(stream_result_to);
- throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+ also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests);
+ break_on_failure_ = GTEST_FLAG_GET(break_on_failure);
+ catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions);
+ color_ = GTEST_FLAG_GET(color);
+ death_test_style_ = GTEST_FLAG_GET(death_test_style);
+ death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork);
+ fail_fast_ = GTEST_FLAG_GET(fail_fast);
+ filter_ = GTEST_FLAG_GET(filter);
+ internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test);
+ list_tests_ = GTEST_FLAG_GET(list_tests);
+ output_ = GTEST_FLAG_GET(output);
+ brief_ = GTEST_FLAG_GET(brief);
+ print_time_ = GTEST_FLAG_GET(print_time);
+ print_utf8_ = GTEST_FLAG_GET(print_utf8);
+ random_seed_ = GTEST_FLAG_GET(random_seed);
+ repeat_ = GTEST_FLAG_GET(repeat);
+ recreate_environments_when_repeating_ =
+ GTEST_FLAG_GET(recreate_environments_when_repeating);
+ shuffle_ = GTEST_FLAG_GET(shuffle);
+ stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth);
+ stream_result_to_ = GTEST_FLAG_GET(stream_result_to);
+ throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure);
}
// The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS.
~GTestFlagSaver() {
- GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
- GTEST_FLAG(break_on_failure) = break_on_failure_;
- GTEST_FLAG(catch_exceptions) = catch_exceptions_;
- GTEST_FLAG(color) = color_;
- GTEST_FLAG(death_test_style) = death_test_style_;
- GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
- GTEST_FLAG(filter) = filter_;
- GTEST_FLAG(fail_fast) = fail_fast_;
- GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
- GTEST_FLAG(list_tests) = list_tests_;
- GTEST_FLAG(output) = output_;
- GTEST_FLAG(brief) = brief_;
- GTEST_FLAG(print_time) = print_time_;
- GTEST_FLAG(print_utf8) = print_utf8_;
- GTEST_FLAG(random_seed) = random_seed_;
- GTEST_FLAG(repeat) = repeat_;
- GTEST_FLAG(shuffle) = shuffle_;
- GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
- GTEST_FLAG(stream_result_to) = stream_result_to_;
- GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+ GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_);
+ GTEST_FLAG_SET(break_on_failure, break_on_failure_);
+ GTEST_FLAG_SET(catch_exceptions, catch_exceptions_);
+ GTEST_FLAG_SET(color, color_);
+ GTEST_FLAG_SET(death_test_style, death_test_style_);
+ GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_);
+ GTEST_FLAG_SET(filter, filter_);
+ GTEST_FLAG_SET(fail_fast, fail_fast_);
+ GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_);
+ GTEST_FLAG_SET(list_tests, list_tests_);
+ GTEST_FLAG_SET(output, output_);
+ GTEST_FLAG_SET(brief, brief_);
+ GTEST_FLAG_SET(print_time, print_time_);
+ GTEST_FLAG_SET(print_utf8, print_utf8_);
+ GTEST_FLAG_SET(random_seed, random_seed_);
+ GTEST_FLAG_SET(repeat, repeat_);
+ GTEST_FLAG_SET(recreate_environments_when_repeating,
+ recreate_environments_when_repeating_);
+ GTEST_FLAG_SET(shuffle, shuffle_);
+ GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_);
+ GTEST_FLAG_SET(stream_result_to, stream_result_to_);
+ GTEST_FLAG_SET(throw_on_failure, throw_on_failure_);
}
private:
@@ -224,6 +207,7 @@ class GTestFlagSaver {
bool print_utf8_;
int32_t random_seed_;
int32_t repeat_;
+ bool recreate_environments_when_repeating_;
bool shuffle_;
int32_t stack_trace_depth_;
std::string stream_result_to_;
@@ -278,8 +262,8 @@ GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val);
// returns true if and only if the test should be run on this shard. The test id
// is some arbitrary but unique non-negative integer assigned to each test
// method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
- int total_shards, int shard_index, int test_id);
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+ int test_id);
// STL container utilities.
@@ -290,9 +274,8 @@ inline int CountIf(const Container& c, Predicate predicate) {
// Implemented as an explicit loop since std::count_if() in libCstd on
// Solaris has a non-standard signature.
int count = 0;
- for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
- if (predicate(*it))
- ++count;
+ for (auto it = c.begin(); it != c.end(); ++it) {
+ if (predicate(*it)) ++count;
}
return count;
}
@@ -441,7 +424,9 @@ class OsStackTraceGetterInterface {
static const char* const kElidedFramesMarker;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+ OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete;
+ OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) =
+ delete;
};
// A working implementation of the OsStackTraceGetterInterface interface.
@@ -463,7 +448,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface {
void* caller_frame_ = nullptr;
#endif // GTEST_HAS_ABSL
- GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+ OsStackTraceGetter(const OsStackTraceGetter&) = delete;
+ OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete;
};
// Information about a Google Test trace point.
@@ -476,7 +462,7 @@ struct TraceInfo {
// This is the default global test part result reporter used in UnitTestImpl.
// This class should only be used by UnitTestImpl.
class DefaultGlobalTestPartResultReporter
- : public TestPartResultReporterInterface {
+ : public TestPartResultReporterInterface {
public:
explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
// Implements the TestPartResultReporterInterface. Reports the test part
@@ -486,7 +472,10 @@ class DefaultGlobalTestPartResultReporter
private:
UnitTestImpl* const unit_test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+ DefaultGlobalTestPartResultReporter(
+ const DefaultGlobalTestPartResultReporter&) = delete;
+ DefaultGlobalTestPartResultReporter& operator=(
+ const DefaultGlobalTestPartResultReporter&) = delete;
};
// This is the default per thread test part result reporter used in
@@ -502,7 +491,10 @@ class DefaultPerThreadTestPartResultReporter
private:
UnitTestImpl* const unit_test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+ DefaultPerThreadTestPartResultReporter(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
+ DefaultPerThreadTestPartResultReporter& operator=(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
};
// The private implementation of the UnitTest class. We don't protect
@@ -640,7 +632,8 @@ class GTEST_API_ UnitTestImpl {
// For example, if Foo() calls Bar(), which in turn calls
// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
- std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+ std::string CurrentOsStackTraceExceptTop(int skip_count)
+ GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_;
// Finds and returns a TestSuite with the given name. If one doesn't
// exist, creates one and returns it.
@@ -744,9 +737,7 @@ class GTEST_API_ UnitTestImpl {
}
// Clears the results of ad-hoc test assertions.
- void ClearAdHocTestResult() {
- ad_hoc_test_result_.Clear();
- }
+ void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
// Adds a TestProperty to the current TestResult object when invoked in a
// context of a test or a test suite, or to the global property set. If the
@@ -754,10 +745,7 @@ class GTEST_API_ UnitTestImpl {
// updated.
void RecordProperty(const TestProperty& test_property);
- enum ReactionToSharding {
- HONOR_SHARDING_PROTOCOL,
- IGNORE_SHARDING_PROTOCOL
- };
+ enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
// Matches the full name of each test against the user-specified
// filter to decide whether the test should run, then records the
@@ -963,7 +951,8 @@ class GTEST_API_ UnitTestImpl {
// starts.
bool catch_exceptions_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+ UnitTestImpl(const UnitTestImpl&) = delete;
+ UnitTestImpl& operator=(const UnitTestImpl&) = delete;
}; // class UnitTestImpl
// Convenience function for accessing the global UnitTest
@@ -986,8 +975,9 @@ GTEST_API_ bool IsValidEscape(char ch);
GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
GTEST_API_ bool ValidateRegex(const char* regex);
GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
- bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+ char repeat, const char* regex,
+ const char* str);
GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
#endif // GTEST_USES_SIMPLE_RE
@@ -1089,8 +1079,7 @@ class StreamingListener : public EmptyTestEventListener {
}
~SocketWriter() override {
- if (sockfd_ != -1)
- CloseConnection();
+ if (sockfd_ != -1) CloseConnection();
}
// Sends a string to the socket.
@@ -1100,9 +1089,8 @@ class StreamingListener : public EmptyTestEventListener {
const auto len = static_cast<size_t>(message.length());
if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
- GTEST_LOG_(WARNING)
- << "stream_result_to: failed to stream to "
- << host_name_ << ":" << port_num_;
+ GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+ << host_name_ << ":" << port_num_;
}
}
@@ -1123,7 +1111,8 @@ class StreamingListener : public EmptyTestEventListener {
const std::string host_name_;
const std::string port_num_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+ SocketWriter(const SocketWriter&) = delete;
+ SocketWriter& operator=(const SocketWriter&) = delete;
}; // class SocketWriter
// Escapes '=', '&', '%', and '\n' characters in str as "%xx".
@@ -1135,7 +1124,9 @@ class StreamingListener : public EmptyTestEventListener {
}
explicit StreamingListener(AbstractSocketWriter* socket_writer)
- : socket_writer_(socket_writer) { Start(); }
+ : socket_writer_(socket_writer) {
+ Start();
+ }
void OnTestProgramStart(const UnitTest& /* unit_test */) override {
SendLn("event=TestProgramStart");
@@ -1158,22 +1149,22 @@ class StreamingListener : public EmptyTestEventListener {
void OnTestIterationEnd(const UnitTest& unit_test,
int /* iteration */) override {
- SendLn("event=TestIterationEnd&passed=" +
- FormatBool(unit_test.Passed()) + "&elapsed_time=" +
- StreamableToString(unit_test.elapsed_time()) + "ms");
+ SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+ "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+ "ms");
}
// Note that "event=TestCaseStart" is a wire format and has to remain
// "case" for compatibility
- void OnTestCaseStart(const TestCase& test_case) override {
- SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+ void OnTestSuiteStart(const TestSuite& test_suite) override {
+ SendLn(std::string("event=TestCaseStart&name=") + test_suite.name());
}
// Note that "event=TestCaseEnd" is a wire format and has to remain
// "case" for compatibility
- void OnTestCaseEnd(const TestCase& test_case) override {
- SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
- "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
+ void OnTestSuiteEnd(const TestSuite& test_suite) override {
+ SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) +
+ "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) +
"ms");
}
@@ -1183,8 +1174,7 @@ class StreamingListener : public EmptyTestEventListener {
void OnTestEnd(const TestInfo& test_info) override {
SendLn("event=TestEnd&passed=" +
- FormatBool((test_info.result())->Passed()) +
- "&elapsed_time=" +
+ FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
StreamableToString((test_info.result())->elapsed_time()) + "ms");
}
@@ -1208,7 +1198,8 @@ class StreamingListener : public EmptyTestEventListener {
const std::unique_ptr<AbstractSocketWriter> socket_writer_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+ StreamingListener(const StreamingListener&) = delete;
+ StreamingListener& operator=(const StreamingListener&) = delete;
}; // class StreamingListener
#endif // GTEST_CAN_STREAM_RESULTS_
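
The StreamingListener hunk above renames its overrides from OnTestCaseStart/OnTestCaseEnd to OnTestSuiteStart/OnTestSuiteEnd while keeping the "TestCase*" wire format. A minimal sketch of a user-side listener written against the renamed hooks, assuming the standard TestEventListeners registration; the class name and output strings are illustrative only:

#include <cstdio>
#include "gtest/gtest.h"

// Hypothetical listener that logs suite boundaries via the renamed hooks.
class SuiteLogger : public testing::EmptyTestEventListener {
 public:
  void OnTestSuiteStart(const testing::TestSuite& suite) override {
    std::printf("[suite start] %s\n", suite.name());
  }
  void OnTestSuiteEnd(const testing::TestSuite& suite) override {
    std::printf("[suite end] %s (%s)\n", suite.name(),
                suite.Passed() ? "passed" : "failed");
  }
};

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  // Append() transfers ownership, the same way the streaming listener is
  // wired up internally when --gtest_stream_result_to is set.
  testing::UnitTest::GetInstance()->listeners().Append(new SuiteLogger);
  return RUN_ALL_TESTS();
}
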
diff --git a/third_party/googletest/src/googletest/src/gtest-matchers.cc b/third_party/googletest/src/googletest/src/gtest-matchers.cc
index 65104ebab..7e3bcc0cf 100644
--- a/third_party/googletest/src/googletest/src/gtest-matchers.cc
+++ b/third_party/googletest/src/googletest/src/gtest-matchers.cc
@@ -32,12 +32,13 @@
// This file implements just enough of the matcher interface to allow
// EXPECT_DEATH and friends to accept a matcher argument.
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
#include "gtest/gtest-matchers.h"
#include <string>
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
namespace testing {
// Constructs a matcher that matches a const std::string& whose value is
diff --git a/third_party/googletest/src/googletest/src/gtest-port.cc b/third_party/googletest/src/googletest/src/gtest-port.cc
index 53a4d37f9..d797fe4d5 100644
--- a/third_party/googletest/src/googletest/src/gtest-port.cc
+++ b/third_party/googletest/src/googletest/src/gtest-port.cc
@@ -27,61 +27,62 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include "gtest/internal/gtest-port.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+
#include <cstdint>
#include <fstream>
#include <memory>
#if GTEST_OS_WINDOWS
-# include <windows.h>
-# include <io.h>
-# include <sys/stat.h>
-# include <map> // Used in ThreadLocal.
-# ifdef _MSC_VER
-# include <crtdbg.h>
-# endif // _MSC_VER
+#include <io.h>
+#include <sys/stat.h>
+#include <windows.h>
+
+#include <map> // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif // _MSC_VER
#else
-# include <unistd.h>
+#include <unistd.h>
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
#endif // GTEST_OS_MAC
#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# include <sys/sysctl.h>
-# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
-# include <sys/user.h>
-# endif
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
#endif
#if GTEST_OS_QNX
-# include <devctl.h>
-# include <fcntl.h>
-# include <sys/procfs.h>
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
#endif // GTEST_OS_QNX
#if GTEST_OS_AIX
-# include <procinfo.h>
-# include <sys/types.h>
+#include <procinfo.h>
+#include <sys/types.h>
#endif // GTEST_OS_AIX
#if GTEST_OS_FUCHSIA
-# include <zircon/process.h>
-# include <zircon/syscalls.h>
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
#endif // GTEST_OS_FUCHSIA
-#include "gtest/gtest-spi.h"
#include "gtest/gtest-message.h"
+#include "gtest/gtest-spi.h"
#include "gtest/internal/gtest-internal.h"
#include "gtest/internal/gtest-string.h"
#include "src/gtest-internal-inl.h"
@@ -89,16 +90,7 @@
namespace testing {
namespace internal {
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
-const int kStdOutFileno = 1;
-const int kStdErrFileno = 2;
-#else
-const int kStdOutFileno = STDOUT_FILENO;
-const int kStdErrFileno = STDERR_FILENO;
-#endif // _MSC_VER
-
-#if GTEST_OS_LINUX
+#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD
namespace {
template <typename T>
@@ -131,8 +123,7 @@ size_t GetThreadCount() {
if (status == KERN_SUCCESS) {
// task_threads allocates resources in thread_list and we need to free them
// to avoid leaks.
- vm_deallocate(task,
- reinterpret_cast<vm_address_t>(thread_list),
+ vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
sizeof(thread_t) * thread_count);
return static_cast<size_t>(thread_count);
} else {
@@ -141,7 +132,7 @@ size_t GetThreadCount() {
}
#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
- GTEST_OS_NETBSD
+ GTEST_OS_NETBSD
#if GTEST_OS_NETBSD
#undef KERN_PROC
@@ -184,12 +175,12 @@ size_t GetThreadCount() {
// we cannot detect it.
size_t GetThreadCount() {
int mib[] = {
- CTL_KERN,
- KERN_PROC,
- KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
- getpid(),
- sizeof(struct kinfo_proc),
- 0,
+ CTL_KERN,
+ KERN_PROC,
+ KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+ getpid(),
+ sizeof(struct kinfo_proc),
+ 0,
};
u_int miblen = sizeof(mib) / sizeof(mib[0]);
@@ -210,8 +201,7 @@ size_t GetThreadCount() {
// exclude empty members
size_t nthreads = 0;
for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
- if (info[i].p_tid != -1)
- nthreads++;
+ if (info[i].p_tid != -1) nthreads++;
}
return nthreads;
}
@@ -254,13 +244,9 @@ size_t GetThreadCount() {
size_t GetThreadCount() {
int dummy_buffer;
size_t avail;
- zx_status_t status = zx_object_get_info(
- zx_process_self(),
- ZX_INFO_PROCESS_THREADS,
- &dummy_buffer,
- 0,
- nullptr,
- &avail);
+ zx_status_t status =
+ zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+ &dummy_buffer, 0, nullptr, &avail);
if (status == ZX_OK) {
return avail;
} else {
@@ -280,27 +266,15 @@ size_t GetThreadCount() {
#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
-void SleepMilliseconds(int n) {
- ::Sleep(static_cast<DWORD>(n));
-}
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
-AutoHandle::AutoHandle()
- : handle_(INVALID_HANDLE_VALUE) {}
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
-AutoHandle::AutoHandle(Handle handle)
- : handle_(handle) {}
+AutoHandle::~AutoHandle() { Reset(); }
-AutoHandle::~AutoHandle() {
- Reset();
-}
-
-AutoHandle::Handle AutoHandle::Get() const {
- return handle_;
-}
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
-void AutoHandle::Reset() {
- Reset(INVALID_HANDLE_VALUE);
-}
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
void AutoHandle::Reset(HANDLE handle) {
// Resetting with the same handle we already own is invalid.
@@ -312,7 +286,7 @@ void AutoHandle::Reset(HANDLE handle) {
} else {
GTEST_CHECK_(!IsCloseable())
<< "Resetting a valid handle to itself is likely a programmer error "
- "and thus not allowed.";
+ "and thus not allowed.";
}
}
@@ -322,23 +296,6 @@ bool AutoHandle::IsCloseable() const {
return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
}
-Notification::Notification()
- : event_(::CreateEvent(nullptr, // Default security attributes.
- TRUE, // Do not reset automatically.
- FALSE, // Initially unset.
- nullptr)) { // Anonymous event.
- GTEST_CHECK_(event_.Get() != nullptr);
-}
-
-void Notification::Notify() {
- GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
-}
-
-void Notification::WaitForNotification() {
- GTEST_CHECK_(
- ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
-}
-
Mutex::Mutex()
: owner_thread_id_(0),
type_(kDynamic),
@@ -391,25 +348,25 @@ namespace {
// MemoryIsNotDeallocated memory_is_not_deallocated;
// critical_section_ = new CRITICAL_SECTION;
//
-class MemoryIsNotDeallocated
-{
+class MemoryIsNotDeallocated {
public:
MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
// Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
// doesn't report mem leak if there's no matching deallocation.
- _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
+ (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
}
~MemoryIsNotDeallocated() {
// Restore the original _CRTDBG_ALLOC_MEM_DF flag
- _CrtSetDbgFlag(old_crtdbg_flag_);
+ (void)_CrtSetDbgFlag(old_crtdbg_flag_);
}
private:
int old_crtdbg_flag_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated);
+ MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete;
+ MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete;
};
#endif // _MSC_VER
@@ -435,15 +392,13 @@ void Mutex::ThreadSafeLazyInit() {
::InitializeCriticalSection(critical_section_);
// Updates the critical_section_init_phase_ to 2 to signal
// initialization complete.
- GTEST_CHECK_(::InterlockedCompareExchange(
- &critical_section_init_phase_, 2L, 1L) ==
- 1L);
+ GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
+ 2L, 1L) == 1L);
break;
case 1:
// Somebody else is already initializing the mutex; spin until they
// are done.
- while (::InterlockedCompareExchange(&critical_section_init_phase_,
- 2L,
+ while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
2L) != 2L) {
// Possibly yields the rest of the thread's time slice to other
// threads.
@@ -488,9 +443,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
private:
struct ThreadMainParam {
ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
- : runnable_(runnable),
- thread_can_start_(thread_can_start) {
- }
+ : runnable_(runnable), thread_can_start_(thread_can_start) {}
std::unique_ptr<Runnable> runnable_;
// Does not own.
Notification* thread_can_start_;
@@ -508,20 +461,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
// Prohibit instantiation.
ThreadWithParamSupport();
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+ ThreadWithParamSupport(const ThreadWithParamSupport&) = delete;
+ ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete;
};
} // namespace
-ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable,
Notification* thread_can_start)
- : thread_(ThreadWithParamSupport::CreateThread(runnable,
- thread_can_start)) {
-}
+ : thread_(
+ ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
-ThreadWithParamBase::~ThreadWithParamBase() {
- Join();
-}
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
void ThreadWithParamBase::Join() {
GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
@@ -548,8 +499,10 @@ class ThreadLocalRegistryImpl {
ThreadIdToThreadLocals::iterator thread_local_pos =
thread_to_thread_locals->find(current_thread);
if (thread_local_pos == thread_to_thread_locals->end()) {
- thread_local_pos = thread_to_thread_locals->insert(
- std::make_pair(current_thread, ThreadLocalValues())).first;
+ thread_local_pos =
+ thread_to_thread_locals
+ ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+ .first;
StartWatcherThreadFor(current_thread);
}
ThreadLocalValues& thread_local_values = thread_local_pos->second;
@@ -577,9 +530,8 @@ class ThreadLocalRegistryImpl {
ThreadIdToThreadLocals* const thread_to_thread_locals =
GetThreadLocalsMapLocked();
for (ThreadIdToThreadLocals::iterator it =
- thread_to_thread_locals->begin();
- it != thread_to_thread_locals->end();
- ++it) {
+ thread_to_thread_locals->begin();
+ it != thread_to_thread_locals->end(); ++it) {
ThreadLocalValues& thread_local_values = it->second;
ThreadLocalValues::iterator value_pos =
thread_local_values.find(thread_local_instance);
@@ -609,9 +561,8 @@ class ThreadLocalRegistryImpl {
if (thread_local_pos != thread_to_thread_locals->end()) {
ThreadLocalValues& thread_local_values = thread_local_pos->second;
for (ThreadLocalValues::iterator value_pos =
- thread_local_values.begin();
- value_pos != thread_local_values.end();
- ++value_pos) {
+ thread_local_values.begin();
+ value_pos != thread_local_values.end(); ++value_pos) {
value_holders.push_back(value_pos->second);
}
thread_to_thread_locals->erase(thread_local_pos);
@@ -637,9 +588,8 @@ class ThreadLocalRegistryImpl {
static void StartWatcherThreadFor(DWORD thread_id) {
// The returned handle will be kept in thread_map and closed by
// watcher_thread in WatcherThreadFunc.
- HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
- FALSE,
- thread_id);
+ HANDLE thread =
+ ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
GTEST_CHECK_(thread != nullptr);
// We need to pass a valid thread ID pointer into CreateThread for it
// to work correctly under Win98.
@@ -650,7 +600,8 @@ class ThreadLocalRegistryImpl {
&ThreadLocalRegistryImpl::WatcherThreadFunc,
reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
CREATE_SUSPENDED, &watcher_thread_id);
- GTEST_CHECK_(watcher_thread != nullptr);
+ GTEST_CHECK_(watcher_thread != nullptr)
+ << "CreateThread failed with error " << ::GetLastError() << ".";
// Give the watcher thread the same priority as ours to avoid being
// blocked by it.
::SetThreadPriority(watcher_thread,
@@ -664,8 +615,7 @@ class ThreadLocalRegistryImpl {
static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
const ThreadIdAndHandle* tah =
reinterpret_cast<const ThreadIdAndHandle*>(param);
- GTEST_CHECK_(
- ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+ GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
OnThreadExit(tah->first);
::CloseHandle(tah->second);
delete tah;
@@ -689,16 +639,17 @@ class ThreadLocalRegistryImpl {
};
Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(
+ Mutex::kStaticMutex); // NOLINT
ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
- const ThreadLocalBase* thread_local_instance) {
+ const ThreadLocalBase* thread_local_instance) {
return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
thread_local_instance);
}
void ThreadLocalRegistry::OnThreadLocalDestroyed(
- const ThreadLocalBase* thread_local_instance) {
+ const ThreadLocalBase* thread_local_instance) {
ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
}
@@ -786,7 +737,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
bool IsAsciiWordChar(char ch) {
return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
- ('0' <= ch && ch <= '9') || ch == '_';
+ ('0' <= ch && ch <= '9') || ch == '_';
}
// Returns true if and only if "\\c" is a supported escape sequence.
@@ -799,17 +750,28 @@ bool IsValidEscape(char c) {
bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
if (escaped) { // "\\p" where p is pattern_char.
switch (pattern_char) {
- case 'd': return IsAsciiDigit(ch);
- case 'D': return !IsAsciiDigit(ch);
- case 'f': return ch == '\f';
- case 'n': return ch == '\n';
- case 'r': return ch == '\r';
- case 's': return IsAsciiWhiteSpace(ch);
- case 'S': return !IsAsciiWhiteSpace(ch);
- case 't': return ch == '\t';
- case 'v': return ch == '\v';
- case 'w': return IsAsciiWordChar(ch);
- case 'W': return !IsAsciiWordChar(ch);
+ case 'd':
+ return IsAsciiDigit(ch);
+ case 'D':
+ return !IsAsciiDigit(ch);
+ case 'f':
+ return ch == '\f';
+ case 'n':
+ return ch == '\n';
+ case 'r':
+ return ch == '\r';
+ case 's':
+ return IsAsciiWhiteSpace(ch);
+ case 'S':
+ return !IsAsciiWhiteSpace(ch);
+ case 't':
+ return ch == '\t';
+ case 'v':
+ return ch == '\v';
+ case 'w':
+ return IsAsciiWordChar(ch);
+ case 'W':
+ return !IsAsciiWordChar(ch);
}
return IsAsciiPunct(pattern_char) && pattern_char == ch;
}
@@ -820,7 +782,8 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
// Helper function used by ValidateRegex() to format error messages.
static std::string FormatRegexSyntaxError(const char* regex, int index) {
return (Message() << "Syntax error at index " << index
- << " in simple regular expression \"" << regex << "\": ").GetString();
+ << " in simple regular expression \"" << regex << "\": ")
+ .GetString();
}
// Generates non-fatal failures and returns false if regex is invalid;
@@ -862,12 +825,12 @@ bool ValidateRegex(const char* regex) {
<< "'$' can only appear at the end.";
is_valid = false;
} else if (IsInSet(ch, "()[]{}|")) {
- ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
- << "'" << ch << "' is unsupported.";
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+ << "' is unsupported.";
is_valid = false;
} else if (IsRepeat(ch) && !prev_repeatable) {
- ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
- << "'" << ch << "' can only follow a repeatable token.";
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+ << "' can only follow a repeatable token.";
is_valid = false;
}
@@ -885,12 +848,10 @@ bool ValidateRegex(const char* regex) {
// characters to be indexable by size_t, in which case the test will
// probably time out anyway. We are fine with this limitation as
// std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
- bool escaped, char c, char repeat, const char* regex,
- const char* str) {
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+ const char* regex, const char* str) {
const size_t min_count = (repeat == '+') ? 1 : 0;
- const size_t max_count = (repeat == '?') ? 1 :
- static_cast<size_t>(-1) - 1;
+ const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
// We cannot call numeric_limits::max() as it conflicts with the
// max() macro on Windows.
@@ -903,8 +864,7 @@ bool MatchRepetitionAndRegexAtHead(
// greedy match.
return true;
}
- if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
- return false;
+ if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
}
return false;
}
@@ -918,25 +878,23 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
// "$" only matches the end of a string. Note that regex being
// valid guarantees that there's nothing after "$" in it.
- if (*regex == '$')
- return *str == '\0';
+ if (*regex == '$') return *str == '\0';
// Is the first thing in regex an escape sequence?
const bool escaped = *regex == '\\';
- if (escaped)
- ++regex;
+ if (escaped) ++regex;
if (IsRepeat(regex[1])) {
// MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
// here's an indirect recursion. It terminates as the regex gets
// shorter in each recursion.
- return MatchRepetitionAndRegexAtHead(
- escaped, regex[0], regex[1], regex + 2, str);
+ return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+ str);
} else {
// regex isn't empty, isn't "$", and doesn't start with a
// repetition. We match the first atom of regex with the first
// character of str and recurse.
return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
- MatchRegexAtHead(regex + 1, str + 1);
+ MatchRegexAtHead(regex + 1, str + 1);
}
}
@@ -951,13 +909,11 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
bool MatchRegexAnywhere(const char* regex, const char* str) {
if (regex == nullptr || str == nullptr) return false;
- if (*regex == '^')
- return MatchRegexAtHead(regex + 1, str);
+ if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
// A successful match can be anywhere in str.
do {
- if (MatchRegexAtHead(regex, str))
- return true;
+ if (MatchRegexAtHead(regex, str)) return true;
} while (*str++ != '\0');
return false;
}
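
These hunks are pure reformatting of gtest's fallback "simple" regular-expression engine. The supported syntax is exactly what AtomMatchesChar() and ValidateRegex() above admit: single-character atoms, the backslash escapes in the switch, '.', the repeats '?', '*', '+', and the anchors '^' and '$'; grouping, alternation, and character classes are rejected. A small illustration against the public RE wrapper, guarded so it only asserts where gtest actually uses this engine (standalone sketch, not part of the patch):

#include <cassert>
#include <cstdio>
#include "gtest/internal/gtest-port.h"

int main() {
#ifdef GTEST_USES_SIMPLE_RE
  using ::testing::internal::RE;
  assert(RE::FullMatch("abc_123", RE("\\w+")));     // whole string is word chars
  assert(RE::PartialMatch("x = 42;", RE("\\d+")));  // digits appear somewhere
  assert(!RE::FullMatch("abc 123", RE("\\w+")));    // space is not \w
  // RE("(ab)+") or RE("[a-z]") would report a failure from ValidateRegex():
  // grouping and character classes are not part of this engine.
  std::printf("simple-RE checks passed\n");
#else
  std::printf("this build uses a system regex library, not the simple engine\n");
#endif
  return 0;
}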
@@ -1038,8 +994,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
// FormatFileLocation in order to contrast the two functions.
// Note that FormatCompilerIndependentFileLocation() does NOT append colon
// to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
- const char* file, int line) {
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+ int line) {
const std::string file_name(file == nullptr ? kUnknownFile : file);
if (line < 0)
@@ -1050,12 +1006,13 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
: severity_(severity) {
- const char* const marker =
- severity == GTEST_INFO ? "[ INFO ]" :
- severity == GTEST_WARNING ? "[WARNING]" :
- severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
- GetStream() << ::std::endl << marker << " "
- << FormatFileLocation(file, line).c_str() << ": ";
+ const char* const marker = severity == GTEST_INFO ? "[ INFO ]"
+ : severity == GTEST_WARNING ? "[WARNING]"
+ : severity == GTEST_ERROR ? "[ ERROR ]"
+ : "[ FATAL ]";
+ GetStream() << ::std::endl
+ << marker << " " << FormatFileLocation(file, line).c_str()
+ << ": ";
}
// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
@@ -1078,27 +1035,26 @@ class CapturedStream {
public:
// The ctor redirects the stream to a temporary file.
explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-# if GTEST_OS_WINDOWS
- char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT
- char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT
+#if GTEST_OS_WINDOWS
+ char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT
+ char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT
::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
- const UINT success = ::GetTempFileNameA(temp_dir_path,
- "gtest_redir",
+ const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
0, // Generate unique file name.
temp_file_path);
GTEST_CHECK_(success != 0)
<< "Unable to create a temporary file in " << temp_dir_path;
const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
- GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
- << temp_file_path;
+ GTEST_CHECK_(captured_fd != -1)
+ << "Unable to open temporary file " << temp_file_path;
filename_ = temp_file_path;
-# else
+#else
// There's no guarantee that a test has write access to the current
// directory, so we create the temporary file in a temporary directory.
std::string name_template;
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
// Note: Android applications are expected to call the framework's
// Context.getExternalStorageDirectory() method through JNI to get
// the location of the world-writable SD Card directory. However,
@@ -1111,7 +1067,7 @@ class CapturedStream {
// '/sdcard' and other variants cannot be relied on, as they are not
// guaranteed to be mounted, or may have a delay in mounting.
name_template = "/data/local/tmp/";
-# elif GTEST_OS_IOS
+#elif GTEST_OS_IOS
char user_temp_dir[PATH_MAX + 1];
// Documented alternative to NSTemporaryDirectory() (for obtaining creating
@@ -1132,9 +1088,9 @@ class CapturedStream {
name_template = user_temp_dir;
if (name_template.back() != GTEST_PATH_SEP_[0])
name_template.push_back(GTEST_PATH_SEP_[0]);
-# else
+#else
name_template = "/tmp/";
-# endif
+#endif
name_template.append("gtest_captured_stream.XXXXXX");
// mkstemp() modifies the string bytes in place, and does not go beyond the
@@ -1150,15 +1106,13 @@ class CapturedStream {
<< " for test; does the test have access to the /tmp directory?";
}
filename_ = std::move(name_template);
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
fflush(nullptr);
dup2(captured_fd, fd_);
close(captured_fd);
}
- ~CapturedStream() {
- remove(filename_.c_str());
- }
+ ~CapturedStream() { remove(filename_.c_str()); }
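
Apart from preprocessor-indentation and brace-initializer style, nothing functional changes in CapturedStream. The capture trick itself: dup() saves the original descriptor, the stream is redirected into a temporary file with dup2(), and GetCapturedString() later restores the saved descriptor and reads the file back. A minimal POSIX-only sketch of that round trip (illustrative path and names, not gtest code):

#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main() {
  const char* path = "/tmp/capture_demo.txt";  // assumed writable location
  int saved_stdout = dup(STDOUT_FILENO);       // remember the real stdout
  int capture_fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0600);
  std::fflush(stdout);
  dup2(capture_fd, STDOUT_FILENO);             // stdout now goes to the file
  close(capture_fd);

  std::printf("this line is captured\n");

  std::fflush(stdout);
  dup2(saved_stdout, STDOUT_FILENO);           // restore the real stdout
  close(saved_stdout);

  std::FILE* f = std::fopen(path, "r");
  char buf[64] = {0};
  if (f != nullptr) {
    std::fgets(buf, sizeof(buf), f);
    std::fclose(f);
  }
  std::printf("captured: %s", buf);
  std::remove(path);
  return 0;
}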
std::string GetCapturedString() {
if (uncaptured_fd_ != -1) {
@@ -1185,7 +1139,8 @@ class CapturedStream {
// Name of the temporary file holding the stderr output.
::std::string filename_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+ CapturedStream(const CapturedStream&) = delete;
+ CapturedStream& operator=(const CapturedStream&) = delete;
};
GTEST_DISABLE_MSC_DEPRECATED_POP_()
@@ -1213,6 +1168,15 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) {
return content;
}
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
+
// Starts capturing stdout.
void CaptureStdout() {
CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
@@ -1235,10 +1199,6 @@ std::string GetCapturedStderr() {
#endif // GTEST_HAS_STREAM_REDIRECTION
-
-
-
-
size_t GetFileSize(FILE* file) {
fseek(file, 0, SEEK_END);
return static_cast<size_t>(ftell(file));
@@ -1256,7 +1216,8 @@ std::string ReadEntireFile(FILE* file) {
// Keeps reading the file until we cannot read further or the
// pre-determined file size is reached.
do {
- bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+ bytes_last_read =
+ fread(buffer + bytes_read, 1, file_size - bytes_read, file);
bytes_read += bytes_last_read;
} while (bytes_last_read > 0 && bytes_read < file_size);
@@ -1344,7 +1305,7 @@ bool ParseInt32(const Message& src_text, const char* str, int32_t* value) {
// LONG_MAX or LONG_MIN when the input overflows.)
result != long_value
// The parsed value overflows as an int32_t.
- ) {
+ ) {
Message msg;
msg << "WARNING: " << src_text
<< " is expected to be a 32-bit integer, but actually"
@@ -1388,8 +1349,8 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
}
int32_t result = default_value;
- if (!ParseInt32(Message() << "Environment variable " << env_var,
- string_value, &result)) {
+ if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+ &result)) {
printf("The default value %s is used.\n",
(Message() << default_value).GetString().c_str());
fflush(stdout);
@@ -1408,7 +1369,7 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
// not check that the flag is 'output'
// In essence this checks an env variable called XML_OUTPUT_FILE
// and if it is set we prepend "xml:" to its value, if it not set we return ""
-std::string OutputFlagAlsoCheckEnvVar(){
+std::string OutputFlagAlsoCheckEnvVar() {
std::string default_value_for_output_flag = "";
const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
if (nullptr != xml_output_file_env) {
diff --git a/third_party/googletest/src/googletest/src/gtest-printers.cc b/third_party/googletest/src/googletest/src/gtest-printers.cc
index 1b68fcb50..f3976d230 100644
--- a/third_party/googletest/src/googletest/src/gtest-printers.cc
+++ b/third_party/googletest/src/googletest/src/gtest-printers.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Test - The Google C++ Testing and Mocking Framework
//
// This file implements a universal value printer that can print a
@@ -101,7 +100,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
*os << " ... ";
// Rounds up to 2-byte boundary.
- const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+ const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
}
*os << ">";
@@ -136,11 +135,7 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
// - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
// - as a hexadecimal escape sequence (e.g. '\x7F'), or
// - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat {
- kAsIs,
- kHexEscape,
- kSpecialEscape
-};
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
// Returns true if c is a printable ASCII character. We test the
// value of c directly instead of calling isprint(), which is buggy on
@@ -213,35 +208,21 @@ static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
}
}
-static const char* GetCharWidthPrefix(char) {
- return "";
-}
+static const char* GetCharWidthPrefix(char) { return ""; }
-static const char* GetCharWidthPrefix(signed char) {
- return "";
-}
+static const char* GetCharWidthPrefix(signed char) { return ""; }
-static const char* GetCharWidthPrefix(unsigned char) {
- return "";
-}
+static const char* GetCharWidthPrefix(unsigned char) { return ""; }
#ifdef __cpp_char8_t
-static const char* GetCharWidthPrefix(char8_t) {
- return "u8";
-}
+static const char* GetCharWidthPrefix(char8_t) { return "u8"; }
#endif
-static const char* GetCharWidthPrefix(char16_t) {
- return "u";
-}
+static const char* GetCharWidthPrefix(char16_t) { return "u"; }
-static const char* GetCharWidthPrefix(char32_t) {
- return "U";
-}
+static const char* GetCharWidthPrefix(char32_t) { return "U"; }
-static const char* GetCharWidthPrefix(wchar_t) {
- return "L";
-}
+static const char* GetCharWidthPrefix(wchar_t) { return "L"; }
// Prints a char c as if it's part of a string literal, escaping it when
// necessary; returns how c was formatted.
@@ -276,8 +257,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) {
// To aid user debugging, we also print c's code in decimal, unless
// it's 0 (in which case c was printed as '\\0', making the code
// obvious).
- if (c == 0)
- return;
+ if (c == 0) return;
*os << " (" << static_cast<int>(c);
// For more convenience, we print c's code again in hexadecimal,
@@ -304,17 +284,60 @@ void PrintTo(char32_t c, ::std::ostream* os) {
<< static_cast<uint32_t>(c);
}
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+void PrintTo(__uint128_t v, ::std::ostream* os) {
+ if (v == 0) {
+ *os << "0";
+ return;
+ }
+
+ // Buffer large enough for ceil(log10(2^128))==39 and the null terminator
+ char buf[40];
+ char* p = buf + sizeof(buf);
+
+ // Some configurations have a __uint128_t, but no support for built in
+ // division. Do manual long division instead.
+
+ uint64_t high = static_cast<uint64_t>(v >> 64);
+ uint64_t low = static_cast<uint64_t>(v);
+
+ *--p = 0;
+ while (high != 0 || low != 0) {
+ uint64_t high_mod = high % 10;
+ high = high / 10;
+ // This is the long division algorithm specialized for a divisor of 10 and
+ // only two elements.
+ // Notable values:
+ // 2^64 / 10 == 1844674407370955161
+ // 2^64 % 10 == 6
+ const uint64_t carry = 6 * high_mod + low % 10;
+ low = low / 10 + high_mod * 1844674407370955161 + carry / 10;
+
+ char digit = static_cast<char>(carry % 10);
+ *--p = '0' + digit;
+ }
+ *os << p;
+}
+void PrintTo(__int128_t v, ::std::ostream* os) {
+ __uint128_t uv = static_cast<__uint128_t>(v);
+ if (v < 0) {
+ *os << "-";
+ uv = -uv;
+ }
+ PrintTo(uv, os);
+}
+#endif // __SIZEOF_INT128__
+
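The new 128-bit printer deliberately avoids native 128-bit division; the loop peels one decimal digit per iteration from the (high, low) limb pair using the identities 2^64 / 10 == 1844674407370955161 and 2^64 % 10 == 6. A small self-check of one such step against the compiler's built-in division, where available (standalone sketch, not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
#if defined(__SIZEOF_INT128__)
  const __uint128_t v =
      (static_cast<__uint128_t>(0x0123456789abcdefULL) << 64) |
      0xfedcba9876543210ULL;
  const uint64_t high = static_cast<uint64_t>(v >> 64);
  const uint64_t low = static_cast<uint64_t>(v);

  // One divide-by-10 step, exactly as in the loop above.
  const uint64_t high_mod = high % 10;
  const uint64_t carry = 6 * high_mod + low % 10;
  const uint64_t new_high = high / 10;
  const uint64_t new_low =
      low / 10 + high_mod * 1844674407370955161ULL + carry / 10;

  // The peeled digit is v % 10, and the remaining limbs encode v / 10.
  assert(carry % 10 == static_cast<uint64_t>(v % 10));
  assert(((static_cast<__uint128_t>(new_high) << 64) | new_low) == v / 10);
  std::printf("limb-wise divide-by-10 matches native __uint128_t division\n");
#else
  std::printf("no __uint128_t on this compiler; nothing to check\n");
#endif
  return 0;
}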
// Prints the given array of characters to the ostream. CharType must be either
// char, char8_t, char16_t, char32_t, or wchar_t.
// The array starts at begin, the length is len, it may include '\0' characters
// and may not be NUL-terminated.
template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static CharFormat PrintCharsAsStringTo(
- const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
+ PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) {
const char* const quote_prefix = GetCharWidthPrefix(*begin);
*os << quote_prefix << "\"";
bool is_previous_hex = false;
@@ -340,12 +363,11 @@ static CharFormat PrintCharsAsStringTo(
// Prints a (const) char/wchar_t array of 'len' elements, starting at address
// 'begin'. CharType must be either char or wchar_t.
template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static void UniversalPrintCharArray(
- const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
+ UniversalPrintCharArray(const CharType* begin, size_t len,
+ ostream* os) {
// The code
// const char kFoo[] = "foo";
// generates an array of 4, not 3, elements, with the last one being '\0'.
@@ -436,28 +458,28 @@ void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
namespace {
bool ContainsUnprintableControlCodes(const char* str, size_t length) {
- const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
for (size_t i = 0; i < length; i++) {
unsigned char ch = *s++;
if (std::iscntrl(ch)) {
- switch (ch) {
+ switch (ch) {
case '\t':
case '\n':
case '\r':
break;
default:
return true;
- }
}
+ }
}
return false;
}
-bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }
bool IsValidUTF8(const char* str, size_t length) {
- const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
for (size_t i = 0; i < length;) {
unsigned char lead = s[i++];
@@ -470,15 +492,13 @@ bool IsValidUTF8(const char* str, size_t length) {
} else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
++i; // 2-byte character
} else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
- IsUTF8TrailByte(s[i]) &&
- IsUTF8TrailByte(s[i + 1]) &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
// check for non-shortest form and surrogate
(lead != 0xe0 || s[i] >= 0xa0) &&
(lead != 0xed || s[i] < 0xa0)) {
i += 2; // 3-byte character
} else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
- IsUTF8TrailByte(s[i]) &&
- IsUTF8TrailByte(s[i + 1]) &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
IsUTF8TrailByte(s[i + 2]) &&
// check for non-shortest form
(lead != 0xf0 || s[i] >= 0x90) &&
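
Only whitespace changes in this hunk, but the conditions are easy to misread in diff form. In prose: a trail byte is 0x80-0xBF; a lead byte up to 0xDF (past the ASCII and invalid-lead range handled earlier in the function) starts a 2-byte sequence; 0xE0-0xEF starts a 3-byte sequence, where 0xE0 requires a second byte >= 0xA0 to rule out overlong encodings and 0xED requires one < 0xA0 to rule out UTF-16 surrogates; 0xF0-0xF4 starts a 4-byte sequence, where 0xF0 requires a second byte >= 0x90. A condensed restatement with a few worked byte sequences (standalone sketch; the ASCII and bad-lead-byte handling that precedes these branches is elided, so only leads >= 0xC2 are fed in):

#include <cassert>
#include <cstddef>

// Trail bytes are 0x80..0xBF, matching IsUTF8TrailByte() above.
static bool IsTrail(unsigned char t) { return 0x80 <= t && t <= 0xbf; }

// Condensed multi-byte arms of IsValidUTF8(); the final 0xF4 bound comes from
// the function's continuation beyond this hunk (keeps code points <= U+10FFFF).
static bool ValidMultiByte(const unsigned char* s, size_t len) {
  size_t i = 0;
  while (i < len) {
    const unsigned char lead = s[i++];
    if (lead <= 0xdf && i + 1 <= len && IsTrail(s[i])) {
      i += 1;  // 2-byte sequence
    } else if (0xe0 <= lead && lead <= 0xef && i + 2 <= len && IsTrail(s[i]) &&
               IsTrail(s[i + 1]) &&
               (lead != 0xe0 || s[i] >= 0xa0) &&  // reject non-shortest form
               (lead != 0xed || s[i] < 0xa0)) {   // reject UTF-16 surrogates
      i += 2;  // 3-byte sequence
    } else if (0xf0 <= lead && lead <= 0xf4 && i + 3 <= len && IsTrail(s[i]) &&
               IsTrail(s[i + 1]) && IsTrail(s[i + 2]) &&
               (lead != 0xf0 || s[i] >= 0x90) &&  // reject non-shortest form
               (lead != 0xf4 || s[i] < 0x90)) {   // keep below U+110000
      i += 3;  // 4-byte sequence
    } else {
      return false;
    }
  }
  return true;
}

int main() {
  const unsigned char ok2[] = {0xC3, 0xA9};              // U+00E9
  const unsigned char ok3[] = {0xE2, 0x82, 0xAC};        // U+20AC
  const unsigned char ok4[] = {0xF0, 0x9F, 0x98, 0x80};  // U+1F600
  const unsigned char overlong[] = {0xE0, 0x80, 0x80};   // non-shortest form
  const unsigned char surrogate[] = {0xED, 0xA0, 0x80};  // U+D800
  assert(ValidMultiByte(ok2, sizeof(ok2)));
  assert(ValidMultiByte(ok3, sizeof(ok3)));
  assert(ValidMultiByte(ok4, sizeof(ok4)));
  assert(!ValidMultiByte(overlong, sizeof(overlong)));
  assert(!ValidMultiByte(surrogate, sizeof(surrogate)));
  return 0;
}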
@@ -502,7 +522,7 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
void PrintStringTo(const ::std::string& s, ostream* os) {
if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
- if (GTEST_FLAG(print_utf8)) {
+ if (GTEST_FLAG_GET(print_utf8)) {
ConditionalPrintAsText(s.data(), s.size(), os);
}
}
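
Throughout this update, direct reads of GTEST_FLAG(name) are replaced by the GTEST_FLAG_GET(name) accessor (with GTEST_FLAG_SET(name, value) as the write-side counterpart), which is what lets the flag definitions in gtest.cc further down move out of namespace testing. Code outside gtest that tweaks these flags follows the same pattern; a minimal example:

#include "gtest/gtest.h"

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  GTEST_FLAG_SET(print_utf8, true);              // was: GTEST_FLAG(print_utf8) = true;
  const bool utf8 = GTEST_FLAG_GET(print_utf8);  // was: GTEST_FLAG(print_utf8)
  (void)utf8;
  return RUN_ALL_TESTS();
}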
diff --git a/third_party/googletest/src/googletest/src/gtest-test-part.cc b/third_party/googletest/src/googletest/src/gtest-test-part.cc
index a938683ce..eb7c8d1cf 100644
--- a/third_party/googletest/src/googletest/src/gtest-test-part.cc
+++ b/third_party/googletest/src/googletest/src/gtest-test-part.cc
@@ -51,13 +51,11 @@ std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
return os << internal::FormatFileLocation(result.file_name(),
result.line_number())
<< " "
- << (result.type() == TestPartResult::kSuccess
- ? "Success"
- : result.type() == TestPartResult::kSkip
- ? "Skipped"
- : result.type() == TestPartResult::kFatalFailure
- ? "Fatal failure"
- : "Non-fatal failure")
+ << (result.type() == TestPartResult::kSuccess ? "Success"
+ : result.type() == TestPartResult::kSkip ? "Skipped"
+ : result.type() == TestPartResult::kFatalFailure
+ ? "Fatal failure"
+ : "Non-fatal failure")
<< ":\n"
<< result.message() << std::endl;
}
@@ -86,8 +84,8 @@ namespace internal {
HasNewFatalFailureHelper::HasNewFatalFailureHelper()
: has_new_fatal_failure_(false),
- original_reporter_(GetUnitTestImpl()->
- GetTestPartResultReporterForCurrentThread()) {
+ original_reporter_(
+ GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
}
@@ -98,8 +96,7 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
void HasNewFatalFailureHelper::ReportTestPartResult(
const TestPartResult& result) {
- if (result.fatally_failed())
- has_new_fatal_failure_ = true;
+ if (result.fatally_failed()) has_new_fatal_failure_ = true;
original_reporter_->ReportTestPartResult(result);
}
diff --git a/third_party/googletest/src/googletest/src/gtest-typed-test.cc b/third_party/googletest/src/googletest/src/gtest-typed-test.cc
index c02c3df65..a2828b83c 100644
--- a/third_party/googletest/src/googletest/src/gtest-typed-test.cc
+++ b/third_party/googletest/src/googletest/src/gtest-typed-test.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include "gtest/gtest-typed-test.h"
#include "gtest/gtest.h"
@@ -38,8 +37,7 @@ namespace internal {
// Skips to the first non-space char in str. Returns an empty string if str
// contains only whitespace characters.
static const char* SkipSpaces(const char* str) {
- while (IsSpace(*str))
- str++;
+ while (IsSpace(*str)) str++;
return str;
}
@@ -85,8 +83,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
}
for (RegisteredTestIter it = registered_tests_.begin();
- it != registered_tests_.end();
- ++it) {
+ it != registered_tests_.end(); ++it) {
if (tests.count(it->first) == 0) {
errors << "You forgot to list test " << it->first << ".\n";
}
diff --git a/third_party/googletest/src/googletest/src/gtest.cc b/third_party/googletest/src/googletest/src/gtest.cc
index 21c611aff..6f31dd226 100644
--- a/third_party/googletest/src/googletest/src/gtest.cc
+++ b/third_party/googletest/src/googletest/src/gtest.cc
@@ -31,8 +31,6 @@
// The Google C++ Testing and Mocking Framework (Google Test)
#include "gtest/gtest.h"
-#include "gtest/internal/custom/gtest.h"
-#include "gtest/gtest-spi.h"
#include <ctype.h>
#include <stdarg.h>
@@ -46,79 +44,87 @@
#include <chrono> // NOLINT
#include <cmath>
#include <cstdint>
+#include <initializer_list>
#include <iomanip>
+#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <ostream> // NOLINT
#include <sstream>
+#include <unordered_set>
#include <vector>
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/custom/gtest.h"
+
#if GTEST_OS_LINUX
-# include <fcntl.h> // NOLINT
-# include <limits.h> // NOLINT
-# include <sched.h> // NOLINT
+#include <fcntl.h> // NOLINT
+#include <limits.h> // NOLINT
+#include <sched.h> // NOLINT
// Declares vsnprintf(). This header is not available on Windows.
-# include <strings.h> // NOLINT
-# include <sys/mman.h> // NOLINT
-# include <sys/time.h> // NOLINT
-# include <unistd.h> // NOLINT
-# include <string>
+#include <strings.h> // NOLINT
+#include <sys/mman.h> // NOLINT
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
+
+#include <string>
#elif GTEST_OS_ZOS
-# include <sys/time.h> // NOLINT
+#include <sys/time.h> // NOLINT
// On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h> // NOLINT
+#include <strings.h> // NOLINT
#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE.
-# include <windows.h> // NOLINT
-# undef min
+#include <windows.h> // NOLINT
+#undef min
#elif GTEST_OS_WINDOWS // We are on Windows proper.
-# include <windows.h> // NOLINT
-# undef min
+#include <windows.h> // NOLINT
+#undef min
#ifdef _MSC_VER
-# include <crtdbg.h> // NOLINT
+#include <crtdbg.h> // NOLINT
#endif
-# include <io.h> // NOLINT
-# include <sys/timeb.h> // NOLINT
-# include <sys/types.h> // NOLINT
-# include <sys/stat.h> // NOLINT
+#include <io.h> // NOLINT
+#include <sys/stat.h> // NOLINT
+#include <sys/timeb.h> // NOLINT
+#include <sys/types.h> // NOLINT
-# if GTEST_OS_WINDOWS_MINGW
-# include <sys/time.h> // NOLINT
-# endif // GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS_MINGW
+#include <sys/time.h> // NOLINT
+#endif // GTEST_OS_WINDOWS_MINGW
#else
// cpplint thinks that the header is already included, so we want to
// silence it.
-# include <sys/time.h> // NOLINT
-# include <unistd.h> // NOLINT
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
#endif // GTEST_OS_LINUX
#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
#endif
#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h> // NOLINT
-# include <netdb.h> // NOLINT
-# include <sys/socket.h> // NOLINT
-# include <sys/types.h> // NOLINT
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
+#include <sys/socket.h> // NOLINT
+#include <sys/types.h> // NOLINT
#endif
#include "src/gtest-internal-inl.h"
#if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
+#define vsnprintf _vsnprintf
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_MAC
@@ -131,7 +137,10 @@
#include "absl/debugging/failure_signal_handler.h"
#include "absl/debugging/stacktrace.h"
#include "absl/debugging/symbolize.h"
+#include "absl/flags/parse.h"
+#include "absl/flags/usage.h"
#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
#endif // GTEST_HAS_ABSL
namespace testing {
@@ -177,7 +186,7 @@ const char kStackTraceMarker[] = "\nStack trace:\n";
// is specified on the command line.
bool g_help_flag = false;
-// Utilty function to Open File for Writing
+// Utility function to Open File for Writing
static FILE* OpenFileForWriting(const std::string& output_file) {
FILE* fileout = nullptr;
FilePath output_file_path(output_file);
@@ -216,28 +225,33 @@ static bool GetDefaultFailFast() {
return false;
}
+} // namespace testing
+
GTEST_DEFINE_bool_(
- fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()),
+ fail_fast,
+ testing::internal::BoolFromGTestEnv("fail_fast",
+ testing::GetDefaultFailFast()),
"True if and only if a test failure should stop further test execution.");
GTEST_DEFINE_bool_(
also_run_disabled_tests,
- internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+ testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false),
"Run disabled tests too, in addition to the tests normally being run.");
GTEST_DEFINE_bool_(
- break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false),
+ break_on_failure,
+ testing::internal::BoolFromGTestEnv("break_on_failure", false),
"True if and only if a failed assertion should be a debugger "
"break-point.");
GTEST_DEFINE_bool_(catch_exceptions,
- internal::BoolFromGTestEnv("catch_exceptions", true),
+ testing::internal::BoolFromGTestEnv("catch_exceptions",
+ true),
"True if and only if " GTEST_NAME_
" should catch exceptions and treat them as test failures.");
GTEST_DEFINE_string_(
- color,
- internal::StringFromGTestEnv("color", "auto"),
+ color, testing::internal::StringFromGTestEnv("color", "auto"),
"Whether to use colors in the output. Valid values: yes, no, "
"and auto. 'auto' means to use colors if the output is "
"being sent to a terminal and the TERM environment variable "
@@ -245,7 +259,8 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_string_(
filter,
- internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+ testing::internal::StringFromGTestEnv("filter",
+ testing::GetDefaultFilter()),
"A colon-separated list of glob (not regex) patterns "
"for filtering the tests to run, optionally followed by a "
"'-' and a : separated list of negative patterns (tests to "
@@ -254,13 +269,14 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_bool_(
install_failure_signal_handler,
- internal::BoolFromGTestEnv("install_failure_signal_handler", false),
- "If true and supported on the current platform, " GTEST_NAME_ " should "
+ testing::internal::BoolFromGTestEnv("install_failure_signal_handler",
+ false),
+ "If true and supported on the current platform, " GTEST_NAME_
+ " should "
"install a signal handler that dumps debugging information when fatal "
"signals are raised.");
-GTEST_DEFINE_bool_(list_tests, false,
- "List all tests without running them.");
+GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
// The net priority order after flag processing is thus:
// --gtest_output command line flag
@@ -269,8 +285,8 @@ GTEST_DEFINE_bool_(list_tests, false,
// ''
GTEST_DEFINE_string_(
output,
- internal::StringFromGTestEnv("output",
- internal::OutputFlagAlsoCheckEnvVar().c_str()),
+ testing::internal::StringFromGTestEnv(
+ "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()),
"A format (defaults to \"xml\" but can be specified to be \"json\"), "
"optionally followed by a colon and an output file name or directory. "
"A directory is indicated by a trailing pathname separator. "
@@ -281,65 +297,79 @@ GTEST_DEFINE_string_(
"digits.");
GTEST_DEFINE_bool_(
- brief, internal::BoolFromGTestEnv("brief", false),
+ brief, testing::internal::BoolFromGTestEnv("brief", false),
"True if only test failures should be displayed in text output.");
-GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
+GTEST_DEFINE_bool_(print_time,
+ testing::internal::BoolFromGTestEnv("print_time", true),
"True if and only if " GTEST_NAME_
" should display elapsed time in text output.");
-GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true),
+GTEST_DEFINE_bool_(print_utf8,
+ testing::internal::BoolFromGTestEnv("print_utf8", true),
"True if and only if " GTEST_NAME_
" prints UTF8 characters as text.");
GTEST_DEFINE_int32_(
- random_seed,
- internal::Int32FromGTestEnv("random_seed", 0),
+ random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0),
"Random number seed to use when shuffling test orders. Must be in range "
"[1, 99999], or 0 to use a seed based on the current time.");
GTEST_DEFINE_int32_(
- repeat,
- internal::Int32FromGTestEnv("repeat", 1),
+ repeat, testing::internal::Int32FromGTestEnv("repeat", 1),
"How many times to repeat each test. Specify a negative number "
"for repeating forever. Useful for shaking out flaky tests.");
+GTEST_DEFINE_bool_(
+ recreate_environments_when_repeating,
+ testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating",
+ false),
+ "Controls whether global test environments are recreated for each repeat "
+ "of the tests. If set to false the global test environments are only set "
+ "up once, for the first iteration, and only torn down once, for the last. "
+ "Useful for shaking out flaky tests with stable, expensive test "
+ "environments. If --gtest_repeat is set to a negative number, meaning "
+ "there is no last run, the environments will always be recreated to avoid "
+ "leaks.");
+
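Unlike most of this file, recreate_environments_when_repeating is a genuinely new flag rather than a reformatted one. With the default (false), global test environments are set up once before the first --gtest_repeat iteration and torn down once after the last; setting it to true rebuilds them on every iteration, which helps expose state leaking between runs. A brief usage sketch (the command-line spelling follows the usual --gtest_<name> mapping; the binary name is illustrative):

#include "gtest/gtest.h"

// Equivalent to:  ./my_test --gtest_repeat=50 \
//                           --gtest_recreate_environments_when_repeating=true
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  GTEST_FLAG_SET(repeat, 50);
  GTEST_FLAG_SET(recreate_environments_when_repeating, true);
  return RUN_ALL_TESTS();
}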
GTEST_DEFINE_bool_(show_internal_stack_frames, false,
"True if and only if " GTEST_NAME_
" should include internal stack frames when "
"printing test failure stack traces.");
-GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false),
+GTEST_DEFINE_bool_(shuffle,
+ testing::internal::BoolFromGTestEnv("shuffle", false),
"True if and only if " GTEST_NAME_
" should randomize tests' order on every run.");
GTEST_DEFINE_int32_(
stack_trace_depth,
- internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+ testing::internal::Int32FromGTestEnv("stack_trace_depth",
+ testing::kMaxStackTraceDepth),
"The maximum number of stack frames to print when an "
"assertion fails. The valid range is 0 through 100, inclusive.");
GTEST_DEFINE_string_(
stream_result_to,
- internal::StringFromGTestEnv("stream_result_to", ""),
+ testing::internal::StringFromGTestEnv("stream_result_to", ""),
"This flag specifies the host name and the port number on which to stream "
"test results. Example: \"localhost:555\". The flag is effective only on "
"Linux.");
GTEST_DEFINE_bool_(
throw_on_failure,
- internal::BoolFromGTestEnv("throw_on_failure", false),
+ testing::internal::BoolFromGTestEnv("throw_on_failure", false),
"When this flag is specified, a failed assertion will throw an exception "
"if exceptions are enabled or exit the program with a non-zero code "
"otherwise. For use with an external test framework.");
#if GTEST_USE_OWN_FLAGFILE_FLAG_
GTEST_DEFINE_string_(
- flagfile,
- internal::StringFromGTestEnv("flagfile", ""),
+ flagfile, testing::internal::StringFromGTestEnv("flagfile", ""),
"This flag specifies the flagfile to read command-line flags from.");
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+namespace testing {
namespace internal {
// Generates a random number from [0, range), using a Linear
@@ -348,10 +378,9 @@ namespace internal {
uint32_t Random::Generate(uint32_t range) {
// These constants are the same as are used in glibc's rand(3).
// Use wider types than necessary to prevent unsigned overflow diagnostics.
- state_ = static_cast<uint32_t>(1103515245ULL*state_ + 12345U) % kMaxRange;
+ state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
- GTEST_CHECK_(range > 0)
- << "Cannot generate a number in the range [0, 0).";
+ GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
GTEST_CHECK_(range <= kMaxRange)
<< "Generation of a number in [0, " << range << ") was requested, "
<< "but this can only generate numbers in [0, " << kMaxRange << ").";
@@ -396,32 +425,26 @@ static bool ShouldRunTestSuite(const TestSuite* test_suite) {
}
// AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
- const char* file,
- int line,
- const char* message)
- : data_(new AssertHelperData(type, file, line, message)) {
-}
+AssertHelper::AssertHelper(TestPartResult::Type type, const char* file,
+ int line, const char* message)
+ : data_(new AssertHelperData(type, file, line, message)) {}
-AssertHelper::~AssertHelper() {
- delete data_;
-}
+AssertHelper::~AssertHelper() { delete data_; }
// Message assignment, for assertion streaming support.
void AssertHelper::operator=(const Message& message) const {
- UnitTest::GetInstance()->
- AddTestPartResult(data_->type, data_->file, data_->line,
- AppendUserMessage(data_->message, message),
- UnitTest::GetInstance()->impl()
- ->CurrentOsStackTraceExceptTop(1)
- // Skips the stack frame for this function itself.
- ); // NOLINT
+ UnitTest::GetInstance()->AddTestPartResult(
+ data_->type, data_->file, data_->line,
+ AppendUserMessage(data_->message, message),
+ UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+ // Skips the stack frame for this function itself.
+ ); // NOLINT
}
namespace {
// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
-// to creates test cases for it, a syntetic test case is
+// to creates test cases for it, a synthetic test case is
// inserted to report ether an error or a log message.
//
// This configuration bit will likely be removed at some point.
@@ -452,7 +475,6 @@ class FailureTest : public Test {
const bool as_error_;
};
-
} // namespace
std::set<std::string>* GetIgnoredParameterizedTestSuites() {
@@ -496,7 +518,8 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
"To suppress this error for this test suite, insert the following line "
"(in a non-header) in the namespace it is defined in:"
"\n\n"
- "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
+ "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+ name + ");";
std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
RegisterTest( //
@@ -516,19 +539,18 @@ void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
}
void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) {
- GetUnitTestImpl()
- ->type_parameterized_test_registry()
- .RegisterInstantiation(case_name);
+ GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+ case_name);
}
void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
const char* test_suite_name, CodeLocation code_location) {
suites_.emplace(std::string(test_suite_name),
- TypeParameterizedTestSuiteInfo(code_location));
+ TypeParameterizedTestSuiteInfo(code_location));
}
void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
- const char* test_suite_name) {
+ const char* test_suite_name) {
auto it = suites_.find(std::string(test_suite_name));
if (it != suites_.end()) {
it->second.instantiated = true;
@@ -606,7 +628,8 @@ FilePath GetCurrentExecutableName() {
// Returns the output format, or "" for normal printed output.
std::string UnitTestOptions::GetOutputFormat() {
- const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ std::string s = GTEST_FLAG_GET(output);
+ const char* const gtest_output_flag = s.c_str();
const char* const colon = strchr(gtest_output_flag, ':');
return (colon == nullptr)
? std::string(gtest_output_flag)
@@ -617,19 +640,19 @@ std::string UnitTestOptions::GetOutputFormat() {
// Returns the name of the requested output file, or the default if none
// was explicitly specified.
std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
- const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ std::string s = GTEST_FLAG_GET(output);
+ const char* const gtest_output_flag = s.c_str();
std::string format = GetOutputFormat();
- if (format.empty())
- format = std::string(kDefaultOutputFormat);
+ if (format.empty()) format = std::string(kDefaultOutputFormat);
const char* const colon = strchr(gtest_output_flag, ':');
if (colon == nullptr)
return internal::FilePath::MakeFileName(
- internal::FilePath(
- UnitTest::GetInstance()->original_working_dir()),
- internal::FilePath(kDefaultOutputFile), 0,
- format.c_str()).string();
+ internal::FilePath(
+ UnitTest::GetInstance()->original_working_dir()),
+ internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+ .string();
internal::FilePath output_name(colon + 1);
if (!output_name.IsAbsolutePath())
@@ -637,8 +660,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
internal::FilePath(colon + 1));
- if (!output_name.IsDirectory())
- return output_name.string();
+ if (!output_name.IsDirectory()) return output_name.string();
internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
output_name, internal::GetCurrentExecutableName(),
@@ -699,59 +721,119 @@ static bool PatternMatchesString(const std::string& name_str,
return true;
}
-bool UnitTestOptions::MatchesFilter(const std::string& name_str,
- const char* filter) {
- // The filter is a list of patterns separated by colons (:).
- const char* pattern = filter;
- while (true) {
- // Find the bounds of this pattern.
- const char* const next_sep = strchr(pattern, ':');
- const char* const pattern_end =
- next_sep != nullptr ? next_sep : pattern + strlen(pattern);
-
- // Check if this pattern matches name_str.
- if (PatternMatchesString(name_str, pattern, pattern_end)) {
- return true;
- }
+namespace {
+
+bool IsGlobPattern(const std::string& pattern) {
+ return std::any_of(pattern.begin(), pattern.end(),
+ [](const char c) { return c == '?' || c == '*'; });
+}
+
+class UnitTestFilter {
+ public:
+ UnitTestFilter() = default;
+
+ // Constructs a filter from a string of patterns separated by `:`.
+ explicit UnitTestFilter(const std::string& filter) {
+ // By design "" filter matches "" string.
+ std::vector<std::string> all_patterns;
+ SplitString(filter, ':', &all_patterns);
+ const auto exact_match_patterns_begin = std::partition(
+ all_patterns.begin(), all_patterns.end(), &IsGlobPattern);
+
+ glob_patterns_.reserve(static_cast<size_t>(
+ std::distance(all_patterns.begin(), exact_match_patterns_begin)));
+ std::move(all_patterns.begin(), exact_match_patterns_begin,
+ std::inserter(glob_patterns_, glob_patterns_.begin()));
+ std::move(
+ exact_match_patterns_begin, all_patterns.end(),
+ std::inserter(exact_match_patterns_, exact_match_patterns_.begin()));
+ }
+
+ // Returns true if and only if name matches at least one of the patterns in
+ // the filter.
+ bool MatchesName(const std::string& name) const {
+ return exact_match_patterns_.count(name) > 0 ||
+ std::any_of(glob_patterns_.begin(), glob_patterns_.end(),
+ [&name](const std::string& pattern) {
+ return PatternMatchesString(
+ name, pattern.c_str(),
+ pattern.c_str() + pattern.size());
+ });
+ }
+
+ private:
+ std::vector<std::string> glob_patterns_;
+ std::unordered_set<std::string> exact_match_patterns_;
+};
- // Give up on this pattern. However, if we found a pattern separator (:),
- // advance to the next pattern (skipping over the separator) and restart.
- if (next_sep == nullptr) {
- return false;
+class PositiveAndNegativeUnitTestFilter {
+ public:
+ // Constructs a positive and a negative filter from a string. The string
+ // contains a positive filter optionally followed by a '-' character and a
+ // negative filter. In case only a negative filter is provided the positive
+ // filter will be assumed "*".
+ // A filter is a list of patterns separated by ':'.
+ explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) {
+ std::vector<std::string> positive_and_negative_filters;
+
+ // NOTE: `SplitString` always returns a non-empty container.
+ SplitString(filter, '-', &positive_and_negative_filters);
+ const auto& positive_filter = positive_and_negative_filters.front();
+
+ if (positive_and_negative_filters.size() > 1) {
+ positive_filter_ = UnitTestFilter(
+ positive_filter.empty() ? kUniversalFilter : positive_filter);
+
+ // TODO(b/214626361): Fail on multiple '-' characters
+ // For the moment to preserve old behavior we concatenate the rest of the
+ // string parts with `-` as separator to generate the negative filter.
+ auto negative_filter_string = positive_and_negative_filters[1];
+ for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++)
+ negative_filter_string =
+ negative_filter_string + '-' + positive_and_negative_filters[i];
+ negative_filter_ = UnitTestFilter(negative_filter_string);
+ } else {
+ // In case we don't have a negative filter and positive filter is ""
+ // we do not use kUniversalFilter by design as opposed to when we have a
+ // negative filter.
+ positive_filter_ = UnitTestFilter(positive_filter);
}
- pattern = next_sep + 1;
}
- return true;
+
+ // Returns true if and only if test name (this is generated by appending test
+ // suit name and test name via a '.' character) matches the positive filter
+ // and does not match the negative filter.
+ bool MatchesTest(const std::string& test_suite_name,
+ const std::string& test_name) const {
+ return MatchesName(test_suite_name + "." + test_name);
+ }
+
+ // Returns true if and only if name matches the positive filter and does not
+ // match the negative filter.
+ bool MatchesName(const std::string& name) const {
+ return positive_filter_.MatchesName(name) &&
+ !negative_filter_.MatchesName(name);
+ }
+
+ private:
+ UnitTestFilter positive_filter_;
+ UnitTestFilter negative_filter_;
+};
+} // namespace
+
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+ const char* filter) {
+ return UnitTestFilter(filter).MatchesName(name_str);
}
// Returns true if and only if the user-specified filter matches the test
// suite name and the test name.
bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
const std::string& test_name) {
- const std::string& full_name = test_suite_name + "." + test_name.c_str();
-
// Split --gtest_filter at '-', if there is one, to separate into
// positive filter and negative filter portions
- const char* const p = GTEST_FLAG(filter).c_str();
- const char* const dash = strchr(p, '-');
- std::string positive;
- std::string negative;
- if (dash == nullptr) {
- positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter
- negative = "";
- } else {
- positive = std::string(p, dash); // Everything up to the dash
- negative = std::string(dash + 1); // Everything after the dash
- if (positive.empty()) {
- // Treat '-test1' as the same as '*-test1'
- positive = kUniversalFilter;
- }
- }
-
- // A filter is a colon-separated list of patterns. It matches a
- // test if any pattern in it matches the test.
- return (MatchesFilter(full_name, positive.c_str()) &&
- !MatchesFilter(full_name, negative.c_str()));
+ return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter))
+ .MatchesTest(test_suite_name, test_name);
}
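
The rewritten filter machinery changes representation, not semantics: the string from GTEST_FLAG_GET(filter) is split at the first '-' into a positive and a negative part, each part is split on ':' into patterns, patterns containing '?' or '*' are glob-matched, and all remaining patterns go into an exact-match hash set (an empty positive part with a '-' present still means "*"). For example, --gtest_filter=FooTest.*:BarTest.Parses-FooTest.Slow* keeps FooTest.Fast and BarTest.Parses but drops FooTest.SlowIo and BarTest.Other. A tiny program to try that against (sketch, not gtest code):

#include "gtest/gtest.h"

TEST(FooTest, Fast) {}    // runs: matches glob pattern FooTest.*
TEST(FooTest, SlowIo) {}  // skipped: also matches negative pattern FooTest.Slow*
TEST(BarTest, Parses) {}  // runs: hits the exact-match set
TEST(BarTest, Other) {}   // skipped: matches no positive pattern

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);  // picks up --gtest_filter=...
  return RUN_ALL_TESTS();
}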
#if GTEST_HAS_SEH
@@ -771,7 +853,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
bool should_handle = true;
- if (!GTEST_FLAG(catch_exceptions))
+ if (!GTEST_FLAG_GET(catch_exceptions))
should_handle = false;
else if (exception_code == EXCEPTION_BREAKPOINT)
should_handle = false;
@@ -789,8 +871,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
// results. Intercepts only failures from the current thread.
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
TestPartResultArray* result)
- : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
- result_(result) {
+ : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
Init();
}
@@ -799,8 +880,7 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
// results.
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
InterceptMode intercept_mode, TestPartResultArray* result)
- : intercept_mode_(intercept_mode),
- result_(result) {
+ : intercept_mode_(intercept_mode), result_(result) {
Init();
}
@@ -844,9 +924,7 @@ namespace internal {
// from user test code. GetTestTypeId() is guaranteed to always
// return the same value, as it always calls GetTypeId<>() from the
// gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
- return GetTypeId<Test>();
-}
+TypeId GetTestTypeId() { return GetTypeId<Test>(); }
// The value of GetTestTypeId() as seen from within the Google Test
// library. This is solely for testing GetTestTypeId().
@@ -861,9 +939,9 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
const TestPartResultArray& results,
TestPartResult::Type type,
const std::string& substr) {
- const std::string expected(type == TestPartResult::kFatalFailure ?
- "1 fatal failure" :
- "1 non-fatal failure");
+ const std::string expected(type == TestPartResult::kFatalFailure
+ ? "1 fatal failure"
+ : "1 non-fatal failure");
Message msg;
if (results.size() != 1) {
msg << "Expected: " << expected << "\n"
@@ -882,10 +960,10 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
}
if (strstr(r.message(), substr.c_str()) == nullptr) {
- return AssertionFailure() << "Expected: " << expected << " containing \""
- << substr << "\"\n"
- << " Actual:\n"
- << r;
+ return AssertionFailure()
+ << "Expected: " << expected << " containing \"" << substr << "\"\n"
+ << " Actual:\n"
+ << r;
}
return AssertionSuccess();
@@ -908,7 +986,8 @@ SingleFailureChecker::~SingleFailureChecker() {
}
DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
- UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
const TestPartResult& result) {
@@ -917,7 +996,8 @@ void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
}
DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
- UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
const TestPartResult& result) {
@@ -1024,11 +1104,10 @@ int UnitTestImpl::test_to_run_count() const {
// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
return os_stack_trace_getter()->CurrentStackTrace(
- static_cast<int>(GTEST_FLAG(stack_trace_depth)),
- skip_count + 1
+ static_cast<int>(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1
// Skips the user-specified number of frames plus this function
// itself.
- ); // NOLINT
+ ); // NOLINT
}
// A helper class for measuring elapsed times.
@@ -1072,8 +1151,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
const int unicode_length =
MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
WCHAR* unicode = new WCHAR[unicode_length + 1];
- MultiByteToWideChar(CP_ACP, 0, ansi, length,
- unicode, unicode_length);
+ MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
unicode[unicode_length] = 0;
return unicode;
}
@@ -1082,7 +1160,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
// memory using new. The caller is responsible for deleting the return
// value using delete[]. Returns the ANSI string, or NULL if the
// input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
if (!utf16_str) return nullptr;
const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
0, nullptr, nullptr);
@@ -1101,7 +1179,7 @@ const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
// Unlike strcmp(), this function can handle NULL argument(s). A NULL
// C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
+bool String::CStringEquals(const char* lhs, const char* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
@@ -1115,11 +1193,10 @@ bool String::CStringEquals(const char * lhs, const char * rhs) {
// encoding, and streams the result to the given Message object.
static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
Message* msg) {
- for (size_t i = 0; i != length; ) { // NOLINT
+ for (size_t i = 0; i != length;) { // NOLINT
if (wstr[i] != L'\0') {
*msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
- while (i != length && wstr[i] != L'\0')
- i++;
+ while (i != length && wstr[i] != L'\0') i++;
} else {
*msg << '\0';
i++;
@@ -1161,17 +1238,17 @@ Message::Message() : ss_(new ::std::stringstream) {
// These two overloads allow streaming a wide C string to a Message
// using the UTF-8 encoding.
-Message& Message::operator <<(const wchar_t* wide_c_str) {
+Message& Message::operator<<(const wchar_t* wide_c_str) {
return *this << internal::String::ShowWideCString(wide_c_str);
}
-Message& Message::operator <<(wchar_t* wide_c_str) {
+Message& Message::operator<<(wchar_t* wide_c_str) {
return *this << internal::String::ShowWideCString(wide_c_str);
}
#if GTEST_HAS_STD_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
+Message& Message::operator<<(const ::std::wstring& wstr) {
internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
return *this;
}
@@ -1183,44 +1260,6 @@ std::string Message::GetString() const {
return internal::StringStreamToString(ss_.get());
}
-// AssertionResult constructors.
-// Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
- : success_(other.success_),
- message_(other.message_.get() != nullptr
- ? new ::std::string(*other.message_)
- : static_cast< ::std::string*>(nullptr)) {}
-
-// Swaps two AssertionResults.
-void AssertionResult::swap(AssertionResult& other) {
- using std::swap;
- swap(success_, other.success_);
- swap(message_, other.message_);
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
- AssertionResult negation(!success_);
- if (message_.get() != nullptr) negation << *message_;
- return negation;
-}
-
-// Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
- return AssertionResult(true);
-}
-
-// Makes a failed assertion result.
-AssertionResult AssertionFailure() {
- return AssertionResult(false);
-}
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
- return AssertionFailure() << message;
-}
-
namespace internal {
namespace edit_distance {
@@ -1512,8 +1551,7 @@ std::vector<std::string> SplitEscapedString(const std::string& str) {
AssertionResult EqFailure(const char* lhs_expression,
const char* rhs_expression,
const std::string& lhs_value,
- const std::string& rhs_value,
- bool ignoring_case) {
+ const std::string& rhs_value, bool ignoring_case) {
Message msg;
msg << "Expected equality of these values:";
msg << "\n " << lhs_expression;
@@ -1530,10 +1568,8 @@ AssertionResult EqFailure(const char* lhs_expression,
}
if (!lhs_value.empty() && !rhs_value.empty()) {
- const std::vector<std::string> lhs_lines =
- SplitEscapedString(lhs_value);
- const std::vector<std::string> rhs_lines =
- SplitEscapedString(rhs_value);
+ const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+ const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
msg << "\nWith diff:\n"
<< edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
@@ -1545,27 +1581,21 @@ AssertionResult EqFailure(const char* lhs_expression,
// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
std::string GetBoolAssertionFailureMessage(
- const AssertionResult& assertion_result,
- const char* expression_text,
- const char* actual_predicate_value,
- const char* expected_predicate_value) {
+ const AssertionResult& assertion_result, const char* expression_text,
+ const char* actual_predicate_value, const char* expected_predicate_value) {
const char* actual_message = assertion_result.message();
Message msg;
msg << "Value of: " << expression_text
<< "\n Actual: " << actual_predicate_value;
- if (actual_message[0] != '\0')
- msg << " (" << actual_message << ")";
+ if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
msg << "\nExpected: " << expected_predicate_value;
return msg.GetString();
}
// Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
- const char* expr2,
- const char* abs_error_expr,
- double val1,
- double val2,
- double abs_error) {
+AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2,
+ const char* abs_error_expr, double val1,
+ double val2, double abs_error) {
const double diff = fabs(val1 - val2);
if (diff <= abs_error) return AssertionSuccess();
@@ -1595,20 +1625,17 @@ AssertionResult DoubleNearPredFormat(const char* expr1,
"EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
}
return AssertionFailure()
- << "The difference between " << expr1 << " and " << expr2
- << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
- << expr1 << " evaluates to " << val1 << ",\n"
- << expr2 << " evaluates to " << val2 << ", and\n"
- << abs_error_expr << " evaluates to " << abs_error << ".";
+ << "The difference between " << expr1 << " and " << expr2 << " is "
+ << diff << ", which exceeds " << abs_error_expr << ", where\n"
+ << expr1 << " evaluates to " << val1 << ",\n"
+ << expr2 << " evaluates to " << val2 << ", and\n"
+ << abs_error_expr << " evaluates to " << abs_error << ".";
}
-
// Helper template for implementing FloatLE() and DoubleLE().
template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
- const char* expr2,
- RawType val1,
- RawType val2) {
+AssertionResult FloatingPointLE(const char* expr1, const char* expr2,
+ RawType val1, RawType val2) {
// Returns success if val1 is less than val2,
if (val1 < val2) {
return AssertionSuccess();
@@ -1633,24 +1660,24 @@ AssertionResult FloatingPointLE(const char* expr1,
<< val2;
return AssertionFailure()
- << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
- << " Actual: " << StringStreamToString(&val1_ss) << " vs "
- << StringStreamToString(&val2_ss);
+ << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+ << " Actual: " << StringStreamToString(&val1_ss) << " vs "
+ << StringStreamToString(&val2_ss);
}
} // namespace internal
// Asserts that val1 is less than, or almost equal to, val2. Fails
// otherwise. In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
- float val1, float val2) {
+AssertionResult FloatLE(const char* expr1, const char* expr2, float val1,
+ float val2) {
return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
}
// Asserts that val1 is less than, or almost equal to, val2. Fails
// otherwise. In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
- double val1, double val2) {
+AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1,
+ double val2) {
return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
}
@@ -1658,62 +1685,51 @@ namespace internal {
// The helper function for {ASSERT|EXPECT}_STREQ.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
- const char* rhs_expression,
- const char* lhs,
+ const char* rhs_expression, const char* lhs,
const char* rhs) {
if (String::CStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- false);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), false);
}
// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
- const char* rhs_expression,
- const char* lhs,
+ const char* rhs_expression, const char* lhs,
const char* rhs) {
if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- true);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), true);
}
// The helper function for {ASSERT|EXPECT}_STRNE.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
- const char* s2_expression,
- const char* s1,
+ const char* s2_expression, const char* s1,
const char* s2) {
if (!String::CStringEquals(s1, s2)) {
return AssertionSuccess();
} else {
- return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
- << s2_expression << "), actual: \""
- << s1 << "\" vs \"" << s2 << "\"";
+ return AssertionFailure()
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
}
}
// The helper function for {ASSERT|EXPECT}_STRCASENE.
AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
- const char* s2_expression,
- const char* s1,
+ const char* s2_expression, const char* s1,
const char* s2) {
if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
return AssertionSuccess();
} else {
return AssertionFailure()
- << "Expected: (" << s1_expression << ") != ("
- << s2_expression << ") (ignoring case), actual: \""
- << s1 << "\" vs \"" << s2 << "\"";
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
}
}
@@ -1741,8 +1757,7 @@ bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
// StringType here can be either ::std::string or ::std::wstring.
template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
- const StringType& haystack) {
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
return haystack.find(needle) != StringType::npos;
}
@@ -1751,21 +1766,22 @@ bool IsSubstringPred(const StringType& needle,
// StringType here can be const char*, const wchar_t*, ::std::string,
// or ::std::wstring.
template <typename StringType>
-AssertionResult IsSubstringImpl(
- bool expected_to_be_substring,
- const char* needle_expr, const char* haystack_expr,
- const StringType& needle, const StringType& haystack) {
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+ const char* needle_expr,
+ const char* haystack_expr,
+ const StringType& needle,
+ const StringType& haystack) {
if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
return AssertionSuccess();
const bool is_wide_string = sizeof(needle[0]) > 1;
const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
return AssertionFailure()
- << "Value of: " << needle_expr << "\n"
- << " Actual: " << begin_string_quote << needle << "\"\n"
- << "Expected: " << (expected_to_be_substring ? "" : "not ")
- << "a substring of " << haystack_expr << "\n"
- << "Which is: " << begin_string_quote << haystack << "\"";
+ << "Value of: " << needle_expr << "\n"
+ << " Actual: " << begin_string_quote << needle << "\"\n"
+ << "Expected: " << (expected_to_be_substring ? "" : "not ")
+ << "a substring of " << haystack_expr << "\n"
+ << "Which is: " << begin_string_quote << haystack << "\"";
}
} // namespace
@@ -1774,52 +1790,52 @@ AssertionResult IsSubstringImpl(
// substring of haystack (NULL is considered a substring of itself
// only), and return an appropriate error message when they fail.
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const char* needle, const char* haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const wchar_t* needle, const wchar_t* haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr, const char* needle,
+ const char* haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr, const wchar_t* needle,
+ const wchar_t* haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
#if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
#endif // GTEST_HAS_STD_WSTRING
@@ -1831,43 +1847,42 @@ namespace internal {
namespace {
// Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
- const char* expected,
+AssertionResult HRESULTFailureHelper(const char* expr, const char* expected,
long hr) { // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
// Windows CE doesn't support FormatMessage.
const char error_text[] = "";
-# else
+#else
// Looks up the human-readable system message for the HRESULT code
// and since we're not passing any params to FormatMessage, we don't
// want inserts expanded.
- const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
- FORMAT_MESSAGE_IGNORE_INSERTS;
+ const DWORD kFlags =
+ FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
const DWORD kBufSize = 4096;
// Gets the system's human readable message string for this HRESULT.
- char error_text[kBufSize] = { '\0' };
+ char error_text[kBufSize] = {'\0'};
DWORD message_length = ::FormatMessageA(kFlags,
- 0, // no source, we're asking system
+ 0, // no source, we're asking system
static_cast<DWORD>(hr), // the error
- 0, // no line width restrictions
+ 0, // no line width restrictions
error_text, // output buffer
kBufSize, // buf size
nullptr); // no arguments for inserts
// Trims trailing white space (FormatMessage leaves a trailing CR-LF)
for (; message_length && IsSpace(error_text[message_length - 1]);
- --message_length) {
+ --message_length) {
error_text[message_length - 1] = '\0';
}
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
const std::string error_hex("0x" + String::FormatHexInt(hr));
return ::testing::AssertionFailure()
- << "Expected: " << expr << " " << expected << ".\n"
- << " Actual: " << error_hex << " " << error_text << "\n";
+ << "Expected: " << expr << " " << expected << ".\n"
+ << " Actual: " << error_hex << " " << error_text << "\n";
}
} // namespace
@@ -1901,16 +1916,18 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT
// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// The maximum code-point a one-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
// The maximum code-point a two-byte UTF-8 sequence can represent.
constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
// The maximum code-point a three-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint3 = (static_cast<uint32_t>(1) << (4 + 2*6)) - 1;
+constexpr uint32_t kMaxCodePoint3 =
+ (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
// The maximum code-point a four-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint4 = (static_cast<uint32_t>(1) << (3 + 3*6)) - 1;
+constexpr uint32_t kMaxCodePoint4 =
+ (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
// Chops off the n lowest bits from a bit pattern. Returns the n
// lowest bits. As a side effect, the original bit pattern will be
@@ -1935,7 +1952,7 @@ std::string CodePointToUtf8(uint32_t code_point) {
char str[5]; // Big enough for the largest valid code point.
if (code_point <= kMaxCodePoint1) {
str[1] = '\0';
- str[0] = static_cast<char>(code_point); // 0xxxxxxx
+ str[0] = static_cast<char>(code_point); // 0xxxxxxx
} else if (code_point <= kMaxCodePoint2) {
str[2] = '\0';
str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
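Editorial aside (not part of the diff): the reformatted kMaxCodePoint1..kMaxCodePoint4 constants above are the thresholds CodePointToUtf8 uses to pick a sequence length. A minimal, self-contained sketch of that selection, reusing the same bit arithmetic:

#include <cstdint>
#include <cstdio>

// Editorial sketch: choose a UTF-8 sequence length with the same
// thresholds as the kMaxCodePoint constants in the hunk above.
static int Utf8SequenceLength(uint32_t code_point) {
  const uint32_t kMax1 = (static_cast<uint32_t>(1) << 7) - 1;             // 0x7F
  const uint32_t kMax2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;       // 0x7FF
  const uint32_t kMax3 = (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;   // 0xFFFF
  const uint32_t kMax4 = (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;   // 0x1FFFFF
  if (code_point <= kMax1) return 1;
  if (code_point <= kMax2) return 2;
  if (code_point <= kMax3) return 3;
  if (code_point <= kMax4) return 4;
  return 0;  // Outside the range this encoder handles.
}

int main() {
  // 'A' -> 1 byte, U+03B1 -> 2 bytes, U+20AC -> 3 bytes, U+1F600 -> 4 bytes.
  std::printf("%d %d %d %d\n", Utf8SequenceLength(0x41),
              Utf8SequenceLength(0x3B1), Utf8SequenceLength(0x20AC),
              Utf8SequenceLength(0x1F600));
  return 0;
}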
@@ -1963,8 +1980,8 @@ std::string CodePointToUtf8(uint32_t code_point) {
// and thus should be combined into a single Unicode code point
// using CreateCodePointFromUtf16SurrogatePair.
inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
- return sizeof(wchar_t) == 2 &&
- (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+ return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+ (second & 0xFC00) == 0xDC00;
}
// Creates a Unicode code point from UTF16 surrogate pair.
@@ -1995,8 +2012,7 @@ inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
// and contains invalid UTF-16 surrogate pairs, values in those pairs
// will be encoded as individual Unicode characters from the Basic Multilingual Plane.
std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
- if (num_chars == -1)
- num_chars = static_cast<int>(wcslen(str));
+ if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
::std::stringstream stream;
for (int i = 0; i < num_chars; ++i) {
@@ -2005,8 +2021,8 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
if (str[i] == L'\0') {
break;
} else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
- unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
- str[i + 1]);
+ unicode_code_point =
+ CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
i++;
} else {
unicode_code_point = static_cast<uint32_t>(str[i]);
@@ -2019,7 +2035,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
// Converts a wide C string to an std::string using the UTF-8 encoding.
// NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+std::string String::ShowWideCString(const wchar_t* wide_c_str) {
if (wide_c_str == nullptr) return "(null)";
return internal::WideStringToUtf8(wide_c_str, -1);
@@ -2031,7 +2047,7 @@ std::string String::ShowWideCString(const wchar_t * wide_c_str) {
// Unlike wcscmp(), this function can handle NULL argument(s). A NULL
// C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
@@ -2041,33 +2057,27 @@ bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
// Helper function for *_STREQ on wide strings.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
- const char* rhs_expression,
- const wchar_t* lhs,
+ const char* rhs_expression, const wchar_t* lhs,
const wchar_t* rhs) {
if (String::WideCStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- false);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), false);
}
// Helper function for *_STRNE on wide strings.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
- const char* s2_expression,
- const wchar_t* s1,
+ const char* s2_expression, const wchar_t* s1,
const wchar_t* s2) {
if (!String::WideCStringEquals(s1, s2)) {
return AssertionSuccess();
}
- return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
- << s2_expression << "), actual: "
- << PrintToString(s1)
- << " vs " << PrintToString(s2);
+ return AssertionFailure()
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
}
// Compares two C strings, ignoring case. Returns true if and only if they have
@@ -2076,7 +2086,7 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression,
// Unlike strcasecmp(), this function can handle NULL argument(s). A
// NULL C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
return posix::StrCaseCmp(lhs, rhs) == 0;
@@ -2118,8 +2128,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
// Returns true if and only if str ends with the given suffix, ignoring case.
// Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
- const std::string& str, const std::string& suffix) {
+bool String::EndsWithCaseInsensitive(const std::string& str,
+ const std::string& suffix) {
const size_t str_len = str.length();
const size_t suffix_len = suffix.length();
return (str_len >= suffix_len) &&
@@ -2202,15 +2212,13 @@ TestResult::TestResult()
: death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
// D'tor.
-TestResult::~TestResult() {
-}
+TestResult::~TestResult() {}
// Returns the i-th test part result among all the results. i can
// range from 0 to total_part_count() - 1. If i is not in that range,
// aborts the program.
const TestPartResult& TestResult::GetTestPartResult(int i) const {
- if (i < 0 || i >= total_part_count())
- internal::posix::Abort();
+ if (i < 0 || i >= total_part_count()) internal::posix::Abort();
return test_part_results_.at(static_cast<size_t>(i));
}
@@ -2218,15 +2226,12 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const {
// test_property_count() - 1. If i is not in that range, aborts the
// program.
const TestProperty& TestResult::GetTestProperty(int i) const {
- if (i < 0 || i >= test_property_count())
- internal::posix::Abort();
+ if (i < 0 || i >= test_property_count()) internal::posix::Abort();
return test_properties_.at(static_cast<size_t>(i));
}
// Clears the test part results.
-void TestResult::ClearTestPartResults() {
- test_part_results_.clear();
-}
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
// Adds a test part result to the list.
void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
@@ -2255,15 +2260,8 @@ void TestResult::RecordProperty(const std::string& xml_element,
// The list of reserved attributes used in the <testsuites> element of XML
// output.
static const char* const kReservedTestSuitesAttributes[] = {
- "disabled",
- "errors",
- "failures",
- "name",
- "random_seed",
- "tests",
- "time",
- "timestamp"
-};
+ "disabled", "errors", "failures", "name",
+ "random_seed", "tests", "time", "timestamp"};
// The list of reserved attributes used in the <testsuite> element of XML
// output.
@@ -2273,8 +2271,8 @@ static const char* const kReservedTestSuiteAttributes[] = {
// The list of reserved attributes used in the <testcase> element of XML output.
static const char* const kReservedTestCaseAttributes[] = {
- "classname", "name", "status", "time", "type_param",
- "value_param", "file", "line"};
+ "classname", "name", "status", "time",
+ "type_param", "value_param", "file", "line"};
// Use a slightly different set for allowed output to ensure existing tests can
// still RecordProperty("result") or RecordProperty("timestamp")
@@ -2336,7 +2334,7 @@ static bool ValidateTestPropertyName(
const std::string& property_name,
const std::vector<std::string>& reserved_names) {
if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
- reserved_names.end()) {
+ reserved_names.end()) {
ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
<< " (" << FormatWordList(reserved_names)
<< " are reserved by " << GTEST_NAME_ << ")";
@@ -2374,8 +2372,7 @@ bool TestResult::Skipped() const {
// Returns true if and only if the test failed.
bool TestResult::Failed() const {
for (int i = 0; i < total_part_count(); ++i) {
- if (GetTestPartResult(i).failed())
- return true;
+ if (GetTestPartResult(i).failed()) return true;
}
return false;
}
@@ -2416,27 +2413,22 @@ int TestResult::test_property_count() const {
// Creates a Test object.
// The c'tor saves the states of all flags.
-Test::Test()
- : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
-}
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
// The d'tor restores the states of all flags. The actual work is
// done by the d'tor of the gtest_flag_saver_ field, and thus not
// visible here.
-Test::~Test() {
-}
+Test::~Test() {}
// Sets up the test fixture.
//
// A sub-class may override this.
-void Test::SetUp() {
-}
+void Test::SetUp() {}
// Tears down the test fixture.
//
// A sub-class may override this.
-void Test::TearDown() {
-}
+void Test::TearDown() {}
// Allows user supplied key value pairs to be recorded for later output.
void Test::RecordProperty(const std::string& key, const std::string& value) {
@@ -2541,8 +2533,8 @@ bool Test::HasSameFixtureClass() {
static std::string* FormatSehExceptionMessage(DWORD exception_code,
const char* location) {
Message message;
- message << "SEH exception with code 0x" << std::setbase(16) <<
- exception_code << std::setbase(10) << " thrown in " << location << ".";
+ message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+ << std::setbase(10) << " thrown in " << location << ".";
return new std::string(message.GetString());
}
@@ -2585,8 +2577,8 @@ GoogleTestFailureException::GoogleTestFailureException(
// exceptions in the same function. Therefore, we provide a separate
// wrapper function for handling SEH exceptions.)
template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
- T* object, Result (T::*method)(), const char* location) {
+Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
#if GTEST_HAS_SEH
__try {
return (object->*method)();
@@ -2595,8 +2587,8 @@ Result HandleSehExceptionsInMethodIfSupported(
// We create the exception message on the heap because VC++ prohibits
// creation of objects with destructors on stack in functions using __try
// (see error C2712).
- std::string* exception_message = FormatSehExceptionMessage(
- GetExceptionCode(), location);
+ std::string* exception_message =
+ FormatSehExceptionMessage(GetExceptionCode(), location);
internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
*exception_message);
delete exception_message;
@@ -2612,8 +2604,8 @@ Result HandleSehExceptionsInMethodIfSupported(
// exceptions, if they are supported; returns the 0-value for type
// Result in case of an SEH exception.
template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
- T* object, Result (T::*method)(), const char* location) {
+Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
// NOTE: The user code can affect the way in which Google Test handles
// exceptions by setting GTEST_FLAG(catch_exceptions), but only before
// RUN_ALL_TESTS() starts. It is technically possible to check the flag
@@ -2623,7 +2615,7 @@ Result HandleExceptionsInMethodIfSupported(
// try {
// // Perform the test method.
// } catch (...) {
- // if (GTEST_FLAG(catch_exceptions))
+ // if (GTEST_FLAG_GET(catch_exceptions))
// // Report the exception as failure.
// else
// throw; // Re-throws the original exception.
@@ -2679,16 +2671,16 @@ void Test::Run() {
// GTEST_SKIP().
if (!HasFatalFailure() && !IsSkipped()) {
impl->os_stack_trace_getter()->UponLeavingGTest();
- internal::HandleExceptionsInMethodIfSupported(
- this, &Test::TestBody, "the test body");
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+ "the test body");
}
// However, we want to clean up as much as possible. Hence we will
// always call TearDown(), even if SetUp() or the test body has
// failed.
impl->os_stack_trace_getter()->UponLeavingGTest();
- internal::HandleExceptionsInMethodIfSupported(
- this, &Test::TearDown, "TearDown()");
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+ "TearDown()");
}
// Returns true if and only if the current test has a fatal failure.
@@ -2698,8 +2690,9 @@ bool Test::HasFatalFailure() {
// Returns true if and only if the current test has a non-fatal failure.
bool Test::HasNonfatalFailure() {
- return internal::GetUnitTestImpl()->current_test_result()->
- HasNonfatalFailure();
+ return internal::GetUnitTestImpl()
+ ->current_test_result()
+ ->HasNonfatalFailure();
}
// Returns true if and only if the current test was skipped.
@@ -2799,11 +2792,10 @@ class TestNameIs {
// Constructor.
//
// TestNameIs has NO default constructor.
- explicit TestNameIs(const char* name)
- : name_(name) {}
+ explicit TestNameIs(const char* name) : name_(name) {}
// Returns true if and only if the test name of test_info matches name_.
- bool operator()(const TestInfo * test_info) const {
+ bool operator()(const TestInfo* test_info) const {
return test_info && test_info->name() == name_;
}
@@ -2831,20 +2823,20 @@ void UnitTestImpl::RegisterParameterizedTests() {
// Creates the test object, runs it, records its result, and then
// deletes it.
void TestInfo::Run() {
- if (!should_run_) return;
+ TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+ if (!should_run_) {
+ if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
+ return;
+ }
// Tells UnitTest where to store test result.
internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
impl->set_current_test_info(this);
- TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
// Notifies the unit test event listeners that a test is about to start.
repeater->OnTestStart(*this);
-
result_.set_start_timestamp(internal::GetTimeInMillis());
internal::Timer timer;
-
impl->os_stack_trace_getter()->UponLeavingGTest();
// Creates the test object.
@@ -3009,11 +3001,18 @@ void TestSuite::Run() {
internal::HandleExceptionsInMethodIfSupported(
this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
+ const bool skip_all = ad_hoc_test_result().Failed();
+
start_timestamp_ = internal::GetTimeInMillis();
internal::Timer timer;
for (int i = 0; i < total_test_count(); i++) {
- GetMutableTestInfo(i)->Run();
- if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) {
+ if (skip_all) {
+ GetMutableTestInfo(i)->Skip();
+ } else {
+ GetMutableTestInfo(i)->Run();
+ }
+ if (GTEST_FLAG_GET(fail_fast) &&
+ GetMutableTestInfo(i)->result()->Failed()) {
for (int j = i + 1; j < total_test_count(); j++) {
GetMutableTestInfo(j)->Skip();
}
@@ -3089,11 +3088,10 @@ void TestSuite::UnshuffleTests() {
//
// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
// FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count,
- const char * singular_form,
- const char * plural_form) {
+static std::string FormatCountableNoun(int count, const char* singular_form,
+ const char* plural_form) {
return internal::StreamableToString(count) + " " +
- (count == 1 ? singular_form : plural_form);
+ (count == 1 ? singular_form : plural_form);
}
// Formats the count of tests.
@@ -3110,7 +3108,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) {
// representation. Both kNonFatalFailure and kFatalFailure are translated
// to "Failure", as the user usually doesn't care about the difference
// between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+static const char* TestPartResultTypeToString(TestPartResult::Type type) {
switch (type) {
case TestPartResult::kSkip:
return "Skipped\n";
@@ -3137,17 +3135,18 @@ enum class GTestColor { kDefault, kRed, kGreen, kYellow };
// Prints a TestPartResult to an std::string.
static std::string PrintTestPartResultToString(
const TestPartResult& test_part_result) {
- return (Message()
- << internal::FormatFileLocation(test_part_result.file_name(),
- test_part_result.line_number())
- << " " << TestPartResultTypeToString(test_part_result.type())
- << test_part_result.message()).GetString();
+ return (Message() << internal::FormatFileLocation(
+ test_part_result.file_name(),
+ test_part_result.line_number())
+ << " "
+ << TestPartResultTypeToString(test_part_result.type())
+ << test_part_result.message())
+ .GetString();
}
// Prints a TestPartResult.
static void PrintTestPartResult(const TestPartResult& test_part_result) {
- const std::string& result =
- PrintTestPartResultToString(test_part_result);
+ const std::string& result = PrintTestPartResultToString(test_part_result);
printf("%s\n", result.c_str());
fflush(stdout);
// If the test program runs in Visual Studio or a debugger, the
@@ -3164,8 +3163,8 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) {
}
// class PrettyUnitTestResultPrinter
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
- !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
// Returns the character attribute for the given color.
static WORD GetColorAttribute(GTestColor color) {
@@ -3176,7 +3175,8 @@ static WORD GetColorAttribute(GTestColor color) {
return FOREGROUND_GREEN;
case GTestColor::kYellow:
return FOREGROUND_RED | FOREGROUND_GREEN;
- default: return 0;
+ default:
+ return 0;
}
}
@@ -3232,7 +3232,8 @@ static const char* GetAnsiColorCode(GTestColor color) {
// Returns true if and only if Google Test should use colors in the output.
bool ShouldUseColor(bool stdout_is_tty) {
- const char* const gtest_color = GTEST_FLAG(color).c_str();
+ std::string c = GTEST_FLAG_GET(color);
+ const char* const gtest_color = c.c_str();
if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
@@ -3259,9 +3260,9 @@ bool ShouldUseColor(bool stdout_is_tty) {
}
return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
- String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
- String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
- String::CStringEquals(gtest_color, "1");
+ String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+ String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+ String::CStringEquals(gtest_color, "1");
// We take "yes", "true", "t", and "1" as meaning "yes". If the
// value is neither one of these nor "auto", we treat it as "no" to
// be conservative.
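Editorial note on the flag-accessor change in this hunk: GTEST_FLAG_GET can return the flag value by value (for example when backed by Abseil flags), which is presumably why ShouldUseColor now copies it into a local std::string before calling c_str(). A minimal sketch of the pattern, using a hypothetical by-value accessor as a stand-in:

#include <string>

// Hypothetical stand-in for an accessor that returns a flag by value,
// as GTEST_FLAG_GET(color) may after this change.
static std::string GetColorFlag() { return "auto"; }

void SafeUse() {
  std::string c = GetColorFlag();       // keep the returned copy alive
  const char* gtest_color = c.c_str();  // valid for as long as `c` lives
  (void)gtest_color;
}

// Taking c_str() of the temporary directly would leave a dangling pointer:
//   const char* bad = GetColorFlag().c_str();  // temporary destroyed here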
@@ -3273,18 +3274,13 @@ bool ShouldUseColor(bool stdout_is_tty) {
// that would be colored when printed, as can be done on Linux.
GTEST_ATTRIBUTE_PRINTF_(2, 3)
-static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
+static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
va_list args;
va_start(args, fmt);
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
- GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
- const bool use_color = AlwaysFalse();
-#else
static const bool in_color_mode =
ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
const bool use_color = in_color_mode && (color != GTestColor::kDefault);
-#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
if (!use_color) {
vprintf(fmt, args);
@@ -3292,8 +3288,8 @@ static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
return;
}
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
- !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
// Gets the current text color.
@@ -3364,6 +3360,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
#endif // OnTestCaseStart
void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
@@ -3384,13 +3381,14 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
static void PrintSkippedTests(const UnitTest& unit_test);
};
- // Fired before each iteration of tests starts.
+// Fired before each iteration of tests starts.
void PrettyUnitTestResultPrinter::OnTestIterationStart(
const UnitTest& unit_test, int iteration) {
- if (GTEST_FLAG(repeat) != 1)
+ if (GTEST_FLAG_GET(repeat) != 1)
printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
- const char* const filter = GTEST_FLAG(filter).c_str();
+ std::string f = GTEST_FLAG_GET(filter);
+ const char* const filter = f.c_str();
// Prints the filter if it's not *. This reminds the user that some
// tests may be skipped.
@@ -3406,7 +3404,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart(
internal::posix::GetEnv(kTestTotalShards));
}
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
ColoredPrintf(GTestColor::kYellow,
"Note: Randomizing tests' orders with a seed of %d .\n",
unit_test.random_seed());
@@ -3462,6 +3460,13 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
fflush(stdout);
}
+void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) {
+ ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] ");
+ PrintTestName(test_info.test_suite_name(), test_info.name());
+ printf("\n");
+ fflush(stdout);
+}
+
// Called after an assertion failure.
void PrettyUnitTestResultPrinter::OnTestPartResult(
const TestPartResult& result) {
@@ -3486,12 +3491,12 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
}
PrintTestName(test_info.test_suite_name(), test_info.name());
- if (test_info.result()->Failed())
- PrintFullTestCommentIfPresent(test_info);
+ if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
- if (GTEST_FLAG(print_time)) {
- printf(" (%s ms)\n", internal::StreamableToString(
- test_info.result()->elapsed_time()).c_str());
+ if (GTEST_FLAG_GET(print_time)) {
+ printf(" (%s ms)\n",
+ internal::StreamableToString(test_info.result()->elapsed_time())
+ .c_str());
} else {
printf("\n");
}
@@ -3500,7 +3505,7 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
- if (!GTEST_FLAG(print_time)) return;
+ if (!GTEST_FLAG_GET(print_time)) return;
const std::string counts =
FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
@@ -3511,7 +3516,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
}
#else
void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
- if (!GTEST_FLAG(print_time)) return;
+ if (!GTEST_FLAG_GET(print_time)) return;
const std::string counts =
FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
@@ -3607,7 +3612,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
printf("%s from %s ran.",
FormatTestCount(unit_test.test_to_run_count()).c_str(),
FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms total)",
internal::StreamableToString(unit_test.elapsed_time()).c_str());
}
@@ -3628,7 +3633,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
}
int num_disabled = unit_test.reportable_disabled_test_count();
- if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+ if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
if (unit_test.Passed()) {
printf("\n"); // Add a spacer if no FAILURE banner is displayed.
}
@@ -3664,6 +3669,7 @@ class BriefUnitTestResultPrinter : public TestEventListener {
#endif // OnTestCaseStart
void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
@@ -3700,7 +3706,7 @@ void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
PrintTestName(test_info.test_suite_name(), test_info.name());
PrintFullTestCommentIfPresent(test_info);
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms)\n",
internal::StreamableToString(test_info.result()->elapsed_time())
.c_str());
@@ -3717,7 +3723,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
printf("%s from %s ran.",
FormatTestCount(unit_test.test_to_run_count()).c_str(),
FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms total)",
internal::StreamableToString(unit_test.elapsed_time()).c_str());
}
@@ -3732,7 +3738,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
}
int num_disabled = unit_test.reportable_disabled_test_count();
- if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+ if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
if (unit_test.Passed()) {
printf("\n"); // Add a spacer if no FAILURE banner is displayed.
}
@@ -3752,7 +3758,7 @@ class TestEventRepeater : public TestEventListener {
public:
TestEventRepeater() : forwarding_enabled_(true) {}
~TestEventRepeater() override;
- void Append(TestEventListener *listener);
+ void Append(TestEventListener* listener);
TestEventListener* Release(TestEventListener* listener);
// Controls whether events will be forwarded to listeners_. Set to false
@@ -3770,6 +3776,7 @@ class TestEventRepeater : public TestEventListener {
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void OnTestSuiteStart(const TestSuite& parameter) override;
void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
// Legacy API is deprecated but still available
@@ -3789,18 +3796,19 @@ class TestEventRepeater : public TestEventListener {
// The list of listeners that receive events.
std::vector<TestEventListener*> listeners_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+ TestEventRepeater(const TestEventRepeater&) = delete;
+ TestEventRepeater& operator=(const TestEventRepeater&) = delete;
};
TestEventRepeater::~TestEventRepeater() {
ForEach(listeners_, Delete<TestEventListener>);
}
-void TestEventRepeater::Append(TestEventListener *listener) {
+void TestEventRepeater::Append(TestEventListener* listener) {
listeners_.push_back(listener);
}
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
for (size_t i = 0; i < listeners_.size(); ++i) {
if (listeners_[i] == listener) {
listeners_.erase(listeners_.begin() + static_cast<int>(i));
@@ -3813,14 +3821,14 @@ TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
// Since most methods are very similar, use macros to reduce boilerplate.
// This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
- if (forwarding_enabled_) { \
- for (size_t i = 0; i < listeners_.size(); i++) { \
- listeners_[i]->Name(parameter); \
- } \
- } \
-}
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+ void TestEventRepeater::Name(const Type& parameter) { \
+ if (forwarding_enabled_) { \
+ for (size_t i = 0; i < listeners_.size(); i++) { \
+ listeners_[i]->Name(parameter); \
+ } \
+ } \
+ }
// This defines a member that forwards the call to all listeners in reverse
// order.
#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
@@ -3840,6 +3848,7 @@ GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo)
GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
@@ -3890,12 +3899,13 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
private:
// Is c a whitespace character that is normalized to a space character
// when it appears in an XML attribute value?
- static bool IsNormalizableWhitespace(char c) {
- return c == 0x9 || c == 0xA || c == 0xD;
+ static bool IsNormalizableWhitespace(unsigned char c) {
+ return c == '\t' || c == '\n' || c == '\r';
}
// May c appear in a well-formed XML document?
- static bool IsValidXmlCharacter(char c) {
+ // https://www.w3.org/TR/REC-xml/#charsets
+ static bool IsValidXmlCharacter(unsigned char c) {
return IsNormalizableWhitespace(c) || c >= 0x20;
}
@@ -3965,7 +3975,8 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
// The output file.
const std::string output_file_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+ XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete;
+ XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete;
};
// Creates a new XmlUnitTestResultPrinter.
@@ -4005,8 +4016,8 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
// module will consist of ordinary English text.
// If this module is ever modified to produce version 1.1 XML output,
// most invalid characters can be retained using character references.
-std::string XmlUnitTestResultPrinter::EscapeXml(
- const std::string& str, bool is_attribute) {
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str,
+ bool is_attribute) {
Message m;
for (size_t i = 0; i < str.size(); ++i) {
@@ -4034,8 +4045,9 @@ std::string XmlUnitTestResultPrinter::EscapeXml(
m << '"';
break;
default:
- if (IsValidXmlCharacter(ch)) {
- if (is_attribute && IsNormalizableWhitespace(ch))
+ if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) {
+ if (is_attribute &&
+ IsNormalizableWhitespace(static_cast<unsigned char>(ch)))
m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
<< ";";
else
@@ -4056,7 +4068,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
std::string output;
output.reserve(str.size());
for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
- if (IsValidXmlCharacter(*it))
+ if (IsValidXmlCharacter(static_cast<unsigned char>(*it)))
output.push_back(*it);
return output;
@@ -4064,7 +4076,6 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
// The following routines generate an XML representation of a UnitTest
// object.
-// GOOGLETEST_CM0009 DO NOT DELETE
//
// This is how Google Test concepts map to the DTD:
//
@@ -4113,12 +4124,12 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
return "";
// YYYY-MM-DDThh:mm:ss.sss
return StreamableToString(time_struct.tm_year + 1900) + "-" +
- String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
- String::FormatIntWidth2(time_struct.tm_mday) + "T" +
- String::FormatIntWidth2(time_struct.tm_hour) + ":" +
- String::FormatIntWidth2(time_struct.tm_min) + ":" +
- String::FormatIntWidth2(time_struct.tm_sec) + "." +
- String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
+ String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+ String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+ String::FormatIntWidth2(time_struct.tm_min) + ":" +
+ String::FormatIntWidth2(time_struct.tm_sec) + "." +
+ String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
}
// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
@@ -4129,8 +4140,8 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
for (;;) {
const char* const next_segment = strstr(segment, "]]>");
if (next_segment != nullptr) {
- stream->write(
- segment, static_cast<std::streamsize>(next_segment - segment));
+ stream->write(segment,
+ static_cast<std::streamsize>(next_segment - segment));
*stream << "]]>]]&gt;<![CDATA[";
segment = next_segment + strlen("]]>");
} else {
@@ -4142,15 +4153,13 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
}
void XmlUnitTestResultPrinter::OutputXmlAttribute(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- const std::string& value) {
+ std::ostream* stream, const std::string& element_name,
+ const std::string& name, const std::string& value) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Attribute " << name << " is not allowed for element <" << element_name
<< ">.";
@@ -4216,10 +4225,11 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
OutputXmlAttribute(stream, kTestsuite, "type_param",
test_info.type_param());
}
- if (GTEST_FLAG(list_tests)) {
- OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
- OutputXmlAttribute(stream, kTestsuite, "line",
- StreamableToString(test_info.line()));
+
+ OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+ OutputXmlAttribute(stream, kTestsuite, "line",
+ StreamableToString(test_info.line()));
+ if (GTEST_FLAG_GET(list_tests)) {
*stream << " />\n";
return;
}
@@ -4254,8 +4264,7 @@ void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
internal::FormatCompilerIndependentFileLocation(part.file_name(),
part.line_number());
const std::string summary = location + "\n" + part.summary();
- *stream << " <failure message=\""
- << EscapeXmlAttribute(summary)
+ *stream << " <failure message=\"" << EscapeXmlAttribute(summary)
<< "\" type=\"\">";
const std::string detail = location + "\n" + part.message();
OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
@@ -4295,7 +4304,7 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
OutputXmlAttribute(stream, kTestsuite, "tests",
StreamableToString(test_suite.reportable_test_count()));
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputXmlAttribute(stream, kTestsuite, "failures",
StreamableToString(test_suite.failed_test_count()));
OutputXmlAttribute(
@@ -4343,7 +4352,7 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
stream, kTestsuites, "timestamp",
FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
OutputXmlAttribute(stream, kTestsuites, "random_seed",
StreamableToString(unit_test.random_seed()));
}
@@ -4396,7 +4405,7 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
attributes << " " << property.key() << "="
- << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+ << "\"" << EscapeXmlAttribute(property.value()) << "\"";
}
return attributes.GetString();
}
@@ -4410,15 +4419,15 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties(
return;
}
- *stream << "<" << kProperties << ">\n";
+ *stream << " <" << kProperties << ">\n";
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
- *stream << "<" << kProperty;
+ *stream << " <" << kProperty;
*stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
*stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
*stream << "/>\n";
}
- *stream << "</" << kProperties << ">\n";
+ *stream << " </" << kProperties << ">\n";
}
// End XmlUnitTestResultPrinter
@@ -4442,16 +4451,12 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
// streams the attribute as JSON.
static void OutputJsonKey(std::ostream* stream,
const std::string& element_name,
- const std::string& name,
- const std::string& value,
- const std::string& indent,
- bool comma = true);
+ const std::string& name, const std::string& value,
+ const std::string& indent, bool comma = true);
static void OutputJsonKey(std::ostream* stream,
const std::string& element_name,
- const std::string& name,
- int value,
- const std::string& indent,
- bool comma = true);
+ const std::string& name, int value,
+ const std::string& indent, bool comma = true);
// Streams a test suite JSON stanza containing the given test result.
//
@@ -4484,7 +4489,9 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
// The output file.
const std::string output_file_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+ JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete;
+ JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) =
+ delete;
};
// Creates a new JsonUnitTestResultPrinter.
@@ -4496,7 +4503,7 @@ JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
}
void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
- int /*iteration*/) {
+ int /*iteration*/) {
FILE* jsonout = OpenFileForWriting(output_file_);
std::stringstream stream;
PrintJsonUnitTest(&stream, unit_test);
@@ -4562,55 +4569,48 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
return "";
// YYYY-MM-DDThh:mm:ss
return StreamableToString(time_struct.tm_year + 1900) + "-" +
- String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
- String::FormatIntWidth2(time_struct.tm_mday) + "T" +
- String::FormatIntWidth2(time_struct.tm_hour) + ":" +
- String::FormatIntWidth2(time_struct.tm_min) + ":" +
- String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+ String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+ String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+ String::FormatIntWidth2(time_struct.tm_min) + ":" +
+ String::FormatIntWidth2(time_struct.tm_sec) + "Z";
}
static inline std::string Indent(size_t width) {
return std::string(width, ' ');
}
-void JsonUnitTestResultPrinter::OutputJsonKey(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- const std::string& value,
- const std::string& indent,
- bool comma) {
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name,
+ const std::string& value,
+ const std::string& indent,
+ bool comma) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Key \"" << name << "\" is not allowed for value \"" << element_name
<< "\".";
*stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
- if (comma)
- *stream << ",\n";
+ if (comma) *stream << ",\n";
}
void JsonUnitTestResultPrinter::OutputJsonKey(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- int value,
- const std::string& indent,
- bool comma) {
+ std::ostream* stream, const std::string& element_name,
+ const std::string& name, int value, const std::string& indent, bool comma) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Key \"" << name << "\" is not allowed for value \"" << element_name
<< "\".";
*stream << indent << "\"" << name << "\": " << StreamableToString(value);
- if (comma)
- *stream << ",\n";
+ if (comma) *stream << ",\n";
}
// Streams a test suite JSON stanza containing the given test result.
@@ -4620,7 +4620,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
*stream << Indent(4) << "{\n";
OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6));
OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6));
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6));
OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6));
OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6));
@@ -4674,11 +4674,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
kIndent);
}
- if (GTEST_FLAG(list_tests)) {
- OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
- OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+
+ OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+ OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+ if (GTEST_FLAG_GET(list_tests)) {
*stream << "\n" << Indent(8) << "}";
return;
+ } else {
+ *stream << ",\n";
}
OutputJsonKey(stream, kTestsuite, "status",
@@ -4710,7 +4713,9 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
if (part.failed()) {
*stream << ",\n";
if (++failures == 1) {
- *stream << kIndent << "\"" << "failures" << "\": [\n";
+ *stream << kIndent << "\""
+ << "failures"
+ << "\": [\n";
}
const std::string location =
internal::FormatCompilerIndependentFileLocation(part.file_name(),
@@ -4723,8 +4728,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
}
}
- if (failures > 0)
- *stream << "\n" << kIndent << "]";
+ if (failures > 0) *stream << "\n" << kIndent << "]";
*stream << "\n" << Indent(8) << "}";
}
@@ -4738,7 +4742,7 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite(
OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
kIndent);
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputJsonKey(stream, kTestsuite, "failures",
test_suite.failed_test_count(), kIndent);
OutputJsonKey(stream, kTestsuite, "disabled",
@@ -4785,7 +4789,7 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
OutputJsonKey(stream, kTestsuites, "disabled",
unit_test.reportable_disabled_test_count(), kIndent);
OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
kIndent);
}
@@ -4820,7 +4824,9 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
}
- *stream << "\n" << kIndent << "]\n" << "}\n";
+ *stream << "\n"
+ << kIndent << "]\n"
+ << "}\n";
}
void JsonUnitTestResultPrinter::PrintJsonTestList(
@@ -4855,7 +4861,8 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
Message attributes;
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
- attributes << ",\n" << indent << "\"" << property.key() << "\": "
+ attributes << ",\n"
+ << indent << "\"" << property.key() << "\": "
<< "\"" << EscapeJson(property.value()) << "\"";
}
return attributes.GetString();
@@ -4895,14 +4902,14 @@ void StreamingListener::SocketWriter::MakeConnection() {
addrinfo hints;
memset(&hints, 0, sizeof(hints));
- hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses.
+ hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses.
hints.ai_socktype = SOCK_STREAM;
addrinfo* servinfo = nullptr;
// Use the getaddrinfo() to get a linked list of IP addresses for
// the given host name.
- const int error_num = getaddrinfo(
- host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+ const int error_num =
+ getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
if (error_num != 0) {
GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
<< gai_strerror(error_num);
@@ -4911,8 +4918,8 @@ void StreamingListener::SocketWriter::MakeConnection() {
// Loop through all the results and connect to the first we can.
for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
cur_addr = cur_addr->ai_next) {
- sockfd_ = socket(
- cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+ sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+ cur_addr->ai_protocol);
if (sockfd_ != -1) {
// Connect the client socket to the server socket.
if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
@@ -4962,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
for (int i = 0; i < raw_stack_size; ++i) {
if (raw_stack[i] == caller_frame &&
- !GTEST_FLAG(show_internal_stack_frames)) {
+ !GTEST_FLAG_GET(show_internal_stack_frames)) {
// Add a marker to the trace and stop adding frames.
absl::StrAppend(&result, kElidedFramesMarker, "\n");
break;
@@ -4981,7 +4988,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
return result;
-#else // !GTEST_HAS_ABSL
+#else // !GTEST_HAS_ABSL
static_cast<void>(max_depth);
static_cast<void>(skip_count);
return "";
@@ -5005,14 +5012,14 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
class ScopedPrematureExitFile {
public:
explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
- : premature_exit_filepath_(premature_exit_filepath ?
- premature_exit_filepath : "") {
+ : premature_exit_filepath_(
+ premature_exit_filepath ? premature_exit_filepath : "") {
// If a path to the premature-exit file is specified...
if (!premature_exit_filepath_.empty()) {
// create the file with a single "0" character in it. I/O
// errors are ignored as there's nothing better we can do and we
// don't want to fail the test because of this.
- FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+ FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w");
fwrite("0", 1, 1, pfile);
fclose(pfile);
}
@@ -5034,7 +5041,8 @@ class ScopedPrematureExitFile {
private:
const std::string premature_exit_filepath_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+ ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete;
+ ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete;
};
} // namespace internal
@@ -5208,7 +5216,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
// Gets the time of the test program start, in ms from the start of the
// UNIX epoch.
internal::TimeInMillis UnitTest::start_timestamp() const {
- return impl()->start_timestamp();
+ return impl()->start_timestamp();
}
// Gets the elapsed time, in milliseconds.
@@ -5251,9 +5259,7 @@ TestSuite* UnitTest::GetMutableTestSuite(int i) {
// Returns the list of event listeners that can be used to track events
// inside Google Test.
-TestEventListeners& UnitTest::listeners() {
- return *impl()->listeners();
-}
+TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); }
// Registers and returns a global test environment. When a test
// program is run, all global test environments will be set-up in the
@@ -5278,12 +5284,11 @@ Environment* UnitTest::AddEnvironment(Environment* env) {
// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
// this to report their results. The user code should use the
// assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(
- TestPartResult::Type result_type,
- const char* file_name,
- int line_number,
- const std::string& message,
- const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+ const char* file_name, int line_number,
+ const std::string& message,
+ const std::string& os_stack_trace)
+ GTEST_LOCK_EXCLUDED_(mutex_) {
Message msg;
msg << message;
@@ -5293,8 +5298,9 @@ void UnitTest::AddTestPartResult(
for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
- msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
- << " " << trace.message;
+ msg << "\n"
+ << internal::FormatFileLocation(trace.file, trace.line) << " "
+ << trace.message;
}
}
@@ -5304,8 +5310,8 @@ void UnitTest::AddTestPartResult(
const TestPartResult result = TestPartResult(
result_type, file_name, line_number, msg.GetString().c_str());
- impl_->GetTestPartResultReporterForCurrentThread()->
- ReportTestPartResult(result);
+ impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+ result);
if (result_type != TestPartResult::kSuccess &&
result_type != TestPartResult::kSkip) {
@@ -5314,7 +5320,7 @@ void UnitTest::AddTestPartResult(
// in the code (perhaps in order to use Google Test assertions
// with another testing framework) and specify the former on the
// command line for debugging.
- if (GTEST_FLAG(break_on_failure)) {
+ if (GTEST_FLAG_GET(break_on_failure)) {
#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// Using DebugBreak on Windows allows gtest to still break into a debugger
// when a failure happens and both the --gtest_break_on_failure and
@@ -5331,7 +5337,7 @@ void UnitTest::AddTestPartResult(
// portability: some debuggers don't correctly trap abort().
*static_cast<volatile int*>(nullptr) = 1;
#endif // GTEST_OS_WINDOWS
- } else if (GTEST_FLAG(throw_on_failure)) {
+ } else if (GTEST_FLAG_GET(throw_on_failure)) {
#if GTEST_HAS_EXCEPTIONS
throw internal::GoogleTestFailureException(result);
#else
@@ -5360,7 +5366,7 @@ void UnitTest::RecordProperty(const std::string& key,
// from the main thread.
int UnitTest::Run() {
const bool in_death_test_child_process =
- internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+ GTEST_FLAG_GET(internal_run_death_test).length() > 0;
// Google Test implements this protocol for catching that a test
// program exits before returning control to Google Test:
@@ -5390,7 +5396,7 @@ int UnitTest::Run() {
// Captures the value of GTEST_FLAG(catch_exceptions). This value will be
// used for the duration of the program.
- impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+ impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions));
#if GTEST_OS_WINDOWS
// Either the user wants Google Test to catch exceptions thrown by the
@@ -5398,26 +5404,26 @@ int UnitTest::Run() {
// process. In either case the user does not want to see pop-up dialogs
// about crashes - they are expected.
if (impl()->catch_exceptions() || in_death_test_child_process) {
-# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// SetErrorMode doesn't exist on CE.
SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif // !GTEST_OS_WINDOWS_MOBILE
+#endif // !GTEST_OS_WINDOWS_MOBILE
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
// Death test children can be terminated with _abort(). On Windows,
// _abort() can show a dialog with a warning message. This forces the
// abort message to go to stderr instead.
_set_error_mode(_OUT_TO_STDERR);
-# endif
+#endif
-# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
// In the debug version, Visual Studio pops up a separate dialog
// offering a choice to debug the aborted program. We need to suppress
// this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
// executed. Google Test will notify the user of any unexpected
// failure via stderr.
- if (!GTEST_FLAG(break_on_failure))
+ if (!GTEST_FLAG_GET(break_on_failure))
_set_abort_behavior(
0x0, // Clear the following flags:
_WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump.
@@ -5431,14 +5437,15 @@ int UnitTest::Run() {
_CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
(void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
}
-# endif
+#endif
}
#endif // GTEST_OS_WINDOWS
return internal::HandleExceptionsInMethodIfSupported(
- impl(),
- &internal::UnitTestImpl::RunAllTests,
- "auxiliary test code (environments or event listeners)") ? 0 : 1;
+ impl(), &internal::UnitTestImpl::RunAllTests,
+ "auxiliary test code (environments or event listeners)")
+ ? 0
+ : 1;
}
// Returns the working directory when the first TEST() or TEST_F() was
@@ -5483,14 +5490,10 @@ UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
}
// Creates an empty UnitTest.
-UnitTest::UnitTest() {
- impl_ = new internal::UnitTestImpl(this);
-}
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
// Destructor of UnitTest.
-UnitTest::~UnitTest() {
- delete impl_;
-}
+UnitTest::~UnitTest() { delete impl_; }
// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
// Google Test trace stack.
@@ -5501,8 +5504,7 @@ void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
}
// Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace()
- GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
internal::MutexLock lock(&mutex_);
impl_->gtest_trace_stack().pop_back();
}
@@ -5599,12 +5601,12 @@ void UnitTestImpl::ConfigureXmlOutput() {
// Initializes event listeners for streaming test results in string form.
// Must not be called before InitGoogleTest.
void UnitTestImpl::ConfigureStreamingOutput() {
- const std::string& target = GTEST_FLAG(stream_result_to);
+ const std::string& target = GTEST_FLAG_GET(stream_result_to);
if (!target.empty()) {
const size_t pos = target.find(':');
if (pos != std::string::npos) {
- listeners()->Append(new StreamingListener(target.substr(0, pos),
- target.substr(pos+1)));
+ listeners()->Append(
+ new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
} else {
GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
<< "\" ignored.";
@@ -5642,7 +5644,7 @@ void UnitTestImpl::PostFlagParsingInit() {
// to shut down the default XML output before invoking RUN_ALL_TESTS.
ConfigureXmlOutput();
- if (GTEST_FLAG(brief)) {
+ if (GTEST_FLAG_GET(brief)) {
listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
}
@@ -5652,7 +5654,7 @@ void UnitTestImpl::PostFlagParsingInit() {
#endif // GTEST_CAN_STREAM_RESULTS_
#if GTEST_HAS_ABSL
- if (GTEST_FLAG(install_failure_signal_handler)) {
+ if (GTEST_FLAG_GET(install_failure_signal_handler)) {
absl::FailureSignalHandlerOptions options;
absl::InstallFailureSignalHandler(options);
}
@@ -5710,9 +5712,9 @@ TestSuite* UnitTestImpl::GetTestSuite(
auto* const new_test_suite =
new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
+ const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter);
// Is this a death test suite?
- if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
- kDeathTestSuiteFilter)) {
+ if (death_test_suite_filter.MatchesName(test_suite_name)) {
// Yes. Inserts the test suite after the last death test suite
// defined so far. This only works when the test suites haven't
// been shuffled. Otherwise we may end up running a death test
@@ -5749,8 +5751,7 @@ bool UnitTestImpl::RunAllTests() {
const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
// Do not run any test if the --help flag was specified.
- if (g_help_flag)
- return true;
+ if (g_help_flag) return true;
// Repeats the call to the post-flag parsing initialization in case the
// user didn't call InitGoogleTest.
@@ -5768,11 +5769,11 @@ bool UnitTestImpl::RunAllTests() {
#if GTEST_HAS_DEATH_TEST
in_subprocess_for_death_test =
(internal_run_death_test_flag_.get() != nullptr);
-# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
if (in_subprocess_for_death_test) {
GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
}
-# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
#endif // GTEST_HAS_DEATH_TEST
const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
@@ -5780,19 +5781,18 @@ bool UnitTestImpl::RunAllTests() {
// Compares the full test names with the filter to decide which
// tests to run.
- const bool has_tests_to_run = FilterTests(should_shard
- ? HONOR_SHARDING_PROTOCOL
- : IGNORE_SHARDING_PROTOCOL) > 0;
+ const bool has_tests_to_run =
+ FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+ : IGNORE_SHARDING_PROTOCOL) > 0;
// Lists the tests and exits if the --gtest_list_tests flag was specified.
- if (GTEST_FLAG(list_tests)) {
+ if (GTEST_FLAG_GET(list_tests)) {
// This must be called *after* FilterTests() has been called.
ListTestsMatchingFilter();
return true;
}
- random_seed_ = GTEST_FLAG(shuffle) ?
- GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+ random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed));
// True if and only if at least one test has failed.
bool failed = false;
@@ -5804,9 +5804,21 @@ bool UnitTestImpl::RunAllTests() {
// How many times to repeat the tests? We don't want to repeat them
// when we are inside the subprocess of a death test.
- const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+ const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat);
+
// Repeats forever if the repeat count is negative.
const bool gtest_repeat_forever = repeat < 0;
+
+ // Should test environments be set up and torn down for each repeat, or only
+ // set up on the first and torn down on the last iteration? If there is no
+ // "last" iteration because the tests will repeat forever, always recreate the
+ // environments to avoid leaks in case one of the environments is using
+ // resources that are external to this process. Without this check there would
+ // be no way to clean up those external resources automatically.
+ const bool recreate_environments_when_repeating =
+ GTEST_FLAG_GET(recreate_environments_when_repeating) ||
+ gtest_repeat_forever;
+
for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
// We want to preserve failures generated by ad-hoc test
// assertions executed before RUN_ALL_TESTS().
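
The new recreate_environments_when_repeating flag (or an infinite --gtest_repeat) makes global test environments be set up and torn down on every iteration instead of only on the first and last one, so external resources are not leaked across repeats. A hedged sketch of a program that benefits from this, assuming it is run with --gtest_repeat=3 --gtest_recreate_environments_when_repeating (illustrative only, not part of the patch):

    #include "gtest/gtest.h"

    // Global environment holding a resource external to the process; recreating
    // it on every repeat avoids leaking that resource across iterations.
    class ExternalResourceEnv : public testing::Environment {
     public:
      void SetUp() override { /* acquire the external resource */ }
      void TearDown() override { /* release it */ }
    };

    int main(int argc, char** argv) {
      testing::InitGoogleTest(&argc, argv);
      testing::AddGlobalTestEnvironment(new ExternalResourceEnv);  // owned by gtest
      return RUN_ALL_TESTS();
    }
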
@@ -5815,7 +5827,7 @@ bool UnitTestImpl::RunAllTests() {
Timer timer;
// Shuffles test suites and tests if requested.
- if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+ if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) {
random()->Reseed(static_cast<uint32_t>(random_seed_));
// This should be done before calling OnTestIterationStart(),
// such that a test event listener can see the actual test order
@@ -5828,10 +5840,13 @@ bool UnitTestImpl::RunAllTests() {
// Runs each test suite if there is at least one test to run.
if (has_tests_to_run) {
- // Sets up all environments beforehand.
- repeater->OnEnvironmentsSetUpStart(*parent_);
- ForEach(environments_, SetUpEnvironment);
- repeater->OnEnvironmentsSetUpEnd(*parent_);
+ // Sets up all environments beforehand. If test environments aren't
+ // recreated for each iteration, only do so on the first iteration.
+ if (i == 0 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsSetUpStart(*parent_);
+ ForEach(environments_, SetUpEnvironment);
+ repeater->OnEnvironmentsSetUpEnd(*parent_);
+ }
// Runs the tests only if there was no fatal failure or skip triggered
// during global set-up.
@@ -5853,7 +5868,7 @@ bool UnitTestImpl::RunAllTests() {
for (int test_index = 0; test_index < total_test_suite_count();
test_index++) {
GetMutableSuiteCase(test_index)->Run();
- if (GTEST_FLAG(fail_fast) &&
+ if (GTEST_FLAG_GET(fail_fast) &&
GetMutableSuiteCase(test_index)->Failed()) {
for (int j = test_index + 1; j < total_test_suite_count(); j++) {
GetMutableSuiteCase(j)->Skip();
@@ -5871,11 +5886,15 @@ bool UnitTestImpl::RunAllTests() {
}
}
- // Tears down all environments in reverse order afterwards.
- repeater->OnEnvironmentsTearDownStart(*parent_);
- std::for_each(environments_.rbegin(), environments_.rend(),
- TearDownEnvironment);
- repeater->OnEnvironmentsTearDownEnd(*parent_);
+ // Tears down all environments in reverse order afterwards. If test
+ // environments aren't recreated for each iteration, only do so on the
+ // last iteration.
+ if (i == repeat - 1 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsTearDownStart(*parent_);
+ std::for_each(environments_.rbegin(), environments_.rend(),
+ TearDownEnvironment);
+ repeater->OnEnvironmentsTearDownEnd(*parent_);
+ }
}
elapsed_time_ = timer.Elapsed();
@@ -5896,7 +5915,7 @@ bool UnitTestImpl::RunAllTests() {
// (it's always safe to unshuffle the tests).
UnshuffleTests();
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
// Picks a new random seed for each iteration.
random_seed_ = GetNextRandomSeed(random_seed_);
}
@@ -5947,8 +5966,7 @@ void WriteToShardStatusFileIfNeeded() {
// an error and exits. If in_subprocess_for_death_test, sharding is
// disabled because it must only be applied to the original test
// process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
- const char* shard_index_env,
+bool ShouldShard(const char* total_shards_env, const char* shard_index_env,
bool in_subprocess_for_death_test) {
if (in_subprocess_for_death_test) {
return false;
@@ -5960,27 +5978,27 @@ bool ShouldShard(const char* total_shards_env,
if (total_shards == -1 && shard_index == -1) {
return false;
} else if (total_shards == -1 && shard_index != -1) {
- const Message msg = Message()
- << "Invalid environment variables: you have "
- << kTestShardIndex << " = " << shard_index
- << ", but have left " << kTestTotalShards << " unset.\n";
+ const Message msg = Message() << "Invalid environment variables: you have "
+ << kTestShardIndex << " = " << shard_index
+ << ", but have left " << kTestTotalShards
+ << " unset.\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
} else if (total_shards != -1 && shard_index == -1) {
const Message msg = Message()
- << "Invalid environment variables: you have "
- << kTestTotalShards << " = " << total_shards
- << ", but have left " << kTestShardIndex << " unset.\n";
+ << "Invalid environment variables: you have "
+ << kTestTotalShards << " = " << total_shards
+ << ", but have left " << kTestShardIndex << " unset.\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
} else if (shard_index < 0 || shard_index >= total_shards) {
- const Message msg = Message()
- << "Invalid environment variables: we require 0 <= "
- << kTestShardIndex << " < " << kTestTotalShards
- << ", but you have " << kTestShardIndex << "=" << shard_index
- << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+ const Message msg =
+ Message() << "Invalid environment variables: we require 0 <= "
+ << kTestShardIndex << " < " << kTestTotalShards
+ << ", but you have " << kTestShardIndex << "=" << shard_index
+ << ", " << kTestTotalShards << "=" << total_shards << ".\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
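
For reference, the sharding contract these checks enforce is that GTEST_TOTAL_SHARDS and GTEST_SHARD_INDEX must be set together with 0 <= index < total, and a runnable test is executed on the shard whose index equals test_id % total_shards (see ShouldRunTestOnShard below). A small standalone sketch of that selection rule, not gtest code:

    #include <cstdlib>
    #include <iostream>

    int main() {
      // Mirrors running a test binary with GTEST_TOTAL_SHARDS=4 GTEST_SHARD_INDEX=1.
      const int total_shards = 4;
      const int shard_index = 1;
      for (int test_id = 0; test_id < 8; ++test_id) {
        if (test_id % total_shards == shard_index) {
          std::cout << "shard " << shard_index << " runs test " << test_id << "\n";
        }
      }
      return EXIT_SUCCESS;
    }
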
@@ -6022,11 +6040,16 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
// . Returns the number of tests that should run.
int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
- const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
- Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
- const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
- Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
-
+ const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+ ? Int32FromEnvOrDie(kTestTotalShards, -1)
+ : -1;
+ const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+ ? Int32FromEnvOrDie(kTestShardIndex, -1)
+ : -1;
+
+ const PositiveAndNegativeUnitTestFilter gtest_flag_filter(
+ GTEST_FLAG_GET(filter));
+ const UnitTestFilter disable_test_filter(kDisableTestFilter);
// num_runnable_tests are the number of tests that will
// run across all shards (i.e., match filter and are not disabled).
// num_selected_tests are the number of tests to be run on
@@ -6042,18 +6065,17 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
const std::string test_name(test_info->name());
// A test is disabled if test suite name or test name matches
// kDisableTestFilter.
- const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
- test_suite_name, kDisableTestFilter) ||
- internal::UnitTestOptions::MatchesFilter(
- test_name, kDisableTestFilter);
+ const bool is_disabled =
+ disable_test_filter.MatchesName(test_suite_name) ||
+ disable_test_filter.MatchesName(test_name);
test_info->is_disabled_ = is_disabled;
- const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
- test_suite_name, test_name);
+ const bool matches_filter =
+ gtest_flag_filter.MatchesTest(test_suite_name, test_name);
test_info->matches_filter_ = matches_filter;
const bool is_runnable =
- (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+ (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) &&
matches_filter;
const bool is_in_another_shard =
@@ -6222,8 +6244,8 @@ void UnitTestImpl::UnshuffleTests() {
// For example, if Foo() calls Bar(), which in turn calls
// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
- int skip_count) {
+GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string
+GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) {
// We pass skip_count + 1 to skip this wrapper function in addition
// to what the user really wants to skip.
return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
@@ -6233,7 +6255,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
// suppress unreachable code warnings.
namespace {
class ClassUniqueToAlwaysTrue {};
-}
+} // namespace
bool IsTrue(bool condition) { return condition; }
@@ -6241,8 +6263,7 @@ bool AlwaysTrue() {
#if GTEST_HAS_EXCEPTIONS
// This condition is always false so AlwaysTrue() never actually throws,
// but it makes the compiler think that it may throw.
- if (IsTrue(false))
- throw ClassUniqueToAlwaysTrue();
+ if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
#endif // GTEST_HAS_EXCEPTIONS
return true;
}
@@ -6264,13 +6285,14 @@ bool SkipPrefix(const char* prefix, const char** pstr) {
// part can be omitted.
//
// Returns the value of the flag, or NULL if the parsing failed.
-static const char* ParseFlagValue(const char* str, const char* flag,
+static const char* ParseFlagValue(const char* str, const char* flag_name,
bool def_optional) {
// str and flag must not be NULL.
- if (str == nullptr || flag == nullptr) return nullptr;
+ if (str == nullptr || flag_name == nullptr) return nullptr;
// The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
- const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+ const std::string flag_str =
+ std::string("--") + GTEST_FLAG_PREFIX_ + flag_name;
const size_t flag_len = flag_str.length();
if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
@@ -6301,9 +6323,9 @@ static const char* ParseFlagValue(const char* str, const char* flag,
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+static bool ParseFlag(const char* str, const char* flag_name, bool* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, true);
+ const char* const value_str = ParseFlagValue(str, flag_name, true);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
@@ -6317,16 +6339,16 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
+bool ParseFlag(const char* str, const char* flag_name, int32_t* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, false);
+ const char* const value_str = ParseFlagValue(str, flag_name, false);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
// Sets *value to the value of the flag.
- return ParseInt32(Message() << "The value of flag --" << flag,
- value_str, value);
+ return ParseInt32(Message() << "The value of flag --" << flag_name, value_str,
+ value);
}
// Parses a string for a string flag, in the form of "--flag=value".
@@ -6334,9 +6356,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
template <typename String>
-static bool ParseStringFlag(const char* str, const char* flag, String* value) {
+static bool ParseFlag(const char* str, const char* flag_name, String* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, false);
+ const char* const value_str = ParseFlagValue(str, flag_name, false);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
@@ -6353,8 +6375,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) {
// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
// internal flags and do not trigger the help message.
static bool HasGoogleTestFlagPrefix(const char* str) {
- return (SkipPrefix("--", &str) ||
- SkipPrefix("-", &str) ||
+ return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
SkipPrefix("/", &str)) &&
!SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
(SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
@@ -6437,6 +6458,10 @@ static const char kColorEncodedHelpMessage[] =
"random_seed=@Y[NUMBER]@D\n"
" Random number seed to use for shuffling test orders (between 1 and\n"
" 99999, or 0 to use a seed based on the current time).\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "recreate_environments_when_repeating@D\n"
+ " Sets up and tears down the global test environment on each repeat\n"
+ " of the test.\n"
"\n"
"Test Output:\n"
" @G--" GTEST_FLAG_PREFIX_
@@ -6454,18 +6479,18 @@ static const char kColorEncodedHelpMessage[] =
" Generate a JSON or XML report in the given directory or with the "
"given\n"
" file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
-# if GTEST_CAN_STREAM_RESULTS_
+#if GTEST_CAN_STREAM_RESULTS_
" @G--" GTEST_FLAG_PREFIX_
"stream_result_to=@YHOST@G:@YPORT@D\n"
" Stream test results to the given server.\n"
-# endif // GTEST_CAN_STREAM_RESULTS_
+#endif // GTEST_CAN_STREAM_RESULTS_
"\n"
"Assertion Behavior:\n"
-# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
" @G--" GTEST_FLAG_PREFIX_
"death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
" Set the default death test style.\n"
-# endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
" @G--" GTEST_FLAG_PREFIX_
"break_on_failure@D\n"
" Turn assertion failures into debugger break-points.\n"
@@ -6497,41 +6522,44 @@ static const char kColorEncodedHelpMessage[] =
"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
static bool ParseGoogleTestFlag(const char* const arg) {
- return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
- &GTEST_FLAG(also_run_disabled_tests)) ||
- ParseBoolFlag(arg, kBreakOnFailureFlag,
- &GTEST_FLAG(break_on_failure)) ||
- ParseBoolFlag(arg, kCatchExceptionsFlag,
- &GTEST_FLAG(catch_exceptions)) ||
- ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
- ParseStringFlag(arg, kDeathTestStyleFlag,
- &GTEST_FLAG(death_test_style)) ||
- ParseBoolFlag(arg, kDeathTestUseFork,
- &GTEST_FLAG(death_test_use_fork)) ||
- ParseBoolFlag(arg, kFailFast, &GTEST_FLAG(fail_fast)) ||
- ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
- ParseStringFlag(arg, kInternalRunDeathTestFlag,
- &GTEST_FLAG(internal_run_death_test)) ||
- ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
- ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
- ParseBoolFlag(arg, kBriefFlag, &GTEST_FLAG(brief)) ||
- ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
- ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
- ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
- ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
- ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
- ParseInt32Flag(arg, kStackTraceDepthFlag,
- &GTEST_FLAG(stack_trace_depth)) ||
- ParseStringFlag(arg, kStreamResultToFlag,
- &GTEST_FLAG(stream_result_to)) ||
- ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure));
+#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \
+ do { \
+ auto value = GTEST_FLAG_GET(flag_name); \
+ if (ParseFlag(arg, #flag_name, &value)) { \
+ GTEST_FLAG_SET(flag_name, value); \
+ return true; \
+ } \
+ } while (false)
+
+ GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests);
+ GTEST_INTERNAL_PARSE_FLAG(break_on_failure);
+ GTEST_INTERNAL_PARSE_FLAG(catch_exceptions);
+ GTEST_INTERNAL_PARSE_FLAG(color);
+ GTEST_INTERNAL_PARSE_FLAG(death_test_style);
+ GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork);
+ GTEST_INTERNAL_PARSE_FLAG(fail_fast);
+ GTEST_INTERNAL_PARSE_FLAG(filter);
+ GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test);
+ GTEST_INTERNAL_PARSE_FLAG(list_tests);
+ GTEST_INTERNAL_PARSE_FLAG(output);
+ GTEST_INTERNAL_PARSE_FLAG(brief);
+ GTEST_INTERNAL_PARSE_FLAG(print_time);
+ GTEST_INTERNAL_PARSE_FLAG(print_utf8);
+ GTEST_INTERNAL_PARSE_FLAG(random_seed);
+ GTEST_INTERNAL_PARSE_FLAG(repeat);
+ GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating);
+ GTEST_INTERNAL_PARSE_FLAG(shuffle);
+ GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth);
+ GTEST_INTERNAL_PARSE_FLAG(stream_result_to);
+ GTEST_INTERNAL_PARSE_FLAG(throw_on_failure);
+ return false;
}
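
The long chain of ParseBoolFlag/ParseInt32Flag/ParseStringFlag calls collapses into one overloaded ParseFlag plus the GTEST_INTERNAL_PARSE_FLAG macro above: read the current value, try to parse "--gtest_<name>=<value>", and write it back only on success. A standalone sketch of that parse-then-set pattern using simplified stand-ins (hypothetical names, not the real gtest helpers):

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>
    #include <iostream>
    #include <string>

    // Stand-in for the overloaded ParseFlag(): succeeds only for "--demo_<name>=<int>".
    static bool ParseDemoFlag(const char* arg, const char* name, int32_t* value) {
      const std::string prefix = std::string("--demo_") + name + "=";
      if (std::strncmp(arg, prefix.c_str(), prefix.size()) != 0) return false;
      *value = static_cast<int32_t>(std::atoi(arg + prefix.size()));
      return true;
    }

    int main() {
      int32_t repeat = 1;  // stands in for GTEST_FLAG_GET(repeat)
      if (ParseDemoFlag("--demo_repeat=3", "repeat", &repeat)) {
        std::cout << "repeat set to " << repeat << "\n";  // stands in for GTEST_FLAG_SET
      }
      return 0;
    }
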
#if GTEST_USE_OWN_FLAGFILE_FLAG_
static void LoadFlagsFromFile(const std::string& path) {
FILE* flagfile = posix::FOpen(path.c_str(), "r");
if (!flagfile) {
- GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+ GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile)
<< "\"";
}
std::string contents(ReadEntireFile(flagfile));
@@ -6539,10 +6567,8 @@ static void LoadFlagsFromFile(const std::string& path) {
std::vector<std::string> lines;
SplitString(contents, '\n', &lines);
for (size_t i = 0; i < lines.size(); ++i) {
- if (lines[i].empty())
- continue;
- if (!ParseGoogleTestFlag(lines[i].c_str()))
- g_help_flag = true;
+ if (lines[i].empty()) continue;
+ if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
}
}
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
@@ -6552,25 +6578,23 @@ static void LoadFlagsFromFile(const std::string& path) {
// instantiated to either char or wchar_t.
template <typename CharType>
void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+ std::string flagfile_value;
for (int i = 1; i < *argc; i++) {
const std::string arg_string = StreamableToString(argv[i]);
const char* const arg = arg_string.c_str();
- using internal::ParseBoolFlag;
- using internal::ParseInt32Flag;
- using internal::ParseStringFlag;
+ using internal::ParseFlag;
bool remove_flag = false;
if (ParseGoogleTestFlag(arg)) {
remove_flag = true;
#if GTEST_USE_OWN_FLAGFILE_FLAG_
- } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
- LoadFlagsFromFile(GTEST_FLAG(flagfile));
+ } else if (ParseFlag(arg, "flagfile", &flagfile_value)) {
+ GTEST_FLAG_SET(flagfile, flagfile_value);
+ LoadFlagsFromFile(flagfile_value);
remove_flag = true;
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
- } else if (arg_string == "--help" || arg_string == "-h" ||
- arg_string == "-?" || arg_string == "/?" ||
- HasGoogleTestFlagPrefix(arg)) {
+ } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) {
// Both help flag and unrecognized Google Test flags (excluding
// internal ones) trigger help display.
g_help_flag = true;
@@ -6605,7 +6629,27 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
// Parses the command line for Google Test flags, without initializing
// other parts of Google Test.
void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+#if GTEST_HAS_ABSL
+ if (*argc > 0) {
+ // absl::ParseCommandLine() requires *argc > 0.
+ auto positional_args = absl::flags_internal::ParseCommandLineImpl(
+ *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs,
+ absl::flags_internal::UsageFlagsAction::kHandleUsage,
+ absl::flags_internal::OnUndefinedFlag::kReportUndefined);
+ // Any command-line positional arguments not part of any command-line flag
+ // (or arguments to a flag) are copied back out to argv, with the program
+ // invocation name at position 0, and argc is resized. This includes
+ // positional arguments after the flag-terminating delimiter '--'.
+ // See https://abseil.io/docs/cpp/guides/flags.
+ std::copy(positional_args.begin(), positional_args.end(), argv);
+ if (static_cast<int>(positional_args.size()) < *argc) {
+ argv[positional_args.size()] = nullptr;
+ *argc = static_cast<int>(positional_args.size());
+ }
+ }
+#else
ParseGoogleTestFlagsOnlyImpl(argc, argv);
+#endif
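
With GTEST_HAS_ABSL, flag parsing is delegated to Abseil, which strips recognized flags and copies the remaining positional arguments (including those after the "--" delimiter) back into argv with argc resized. A small sketch, assuming the binary is invoked as "./my_test --gtest_filter=Foo.* input.dat", of how the surviving arguments can be inspected after InitGoogleTest() (illustrative only, not part of the patch):

    #include <iostream>
    #include "gtest/gtest.h"

    int main(int argc, char** argv) {
      testing::InitGoogleTest(&argc, argv);  // strips recognized flags from argv
      for (int i = 0; i < argc; ++i) {
        // Typically prints argv[0] and the positional "input.dat" only.
        std::cout << "argv[" << i << "] = " << argv[i] << "\n";
      }
      return RUN_ALL_TESTS();
    }
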
// Fix the value of *_NSGetArgc() on macOS, but if and only if
// *_NSGetArgv() == argv
@@ -6640,6 +6684,12 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
#if GTEST_HAS_ABSL
absl::InitializeSymbolizer(g_argvs[0].c_str());
+
+ // When using the Abseil Flags library, set the program usage message to the
+ // help message, but remove the color-encoding from the message first.
+ absl::SetProgramUsageMessage(absl::StrReplaceAll(
+ kColorEncodedHelpMessage,
+ {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}}));
#endif // GTEST_HAS_ABSL
ParseGoogleTestFlagsOnly(argc, argv);
@@ -6660,7 +6710,7 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
void InitGoogleTest(int* argc, char** argv) {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
@@ -6670,7 +6720,7 @@ void InitGoogleTest(int* argc, char** argv) {
void InitGoogleTest(int* argc, wchar_t** argv) {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
@@ -6686,42 +6736,42 @@ void InitGoogleTest() {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(&argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
+#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+// Return value of first environment variable that is set and contains
+// a non-empty string. If there are none, return the "fallback" string.
+// Since we like the temporary directory to have a directory separator suffix,
+// add it if not provided in the environment variable value.
+static std::string GetTempDirFromEnv(
+ std::initializer_list<const char*> environment_variables,
+ const char* fallback, char separator) {
+ for (const char* variable_name : environment_variables) {
+ const char* value = internal::posix::GetEnv(variable_name);
+ if (value != nullptr && value[0] != '\0') {
+ if (value[strlen(value) - 1] != separator) {
+ return std::string(value).append(1, separator);
+ }
+ return value;
+ }
+ }
+ return fallback;
+}
+#endif
+
std::string TempDir() {
#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
-#elif GTEST_OS_WINDOWS_MOBILE
- return "\\temp\\";
-#elif GTEST_OS_WINDOWS
- const char* temp_dir = internal::posix::GetEnv("TEMP");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "\\temp\\";
- } else if (temp_dir[strlen(temp_dir) - 1] == '\\') {
- return temp_dir;
- } else {
- return std::string(temp_dir) + "\\";
- }
+#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\');
#elif GTEST_OS_LINUX_ANDROID
- const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "/data/local/tmp/";
- } else {
- return temp_dir;
- }
-#elif GTEST_OS_LINUX
- const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "/tmp/";
- } else {
- return temp_dir;
- }
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/');
#else
- return "/tmp/";
-#endif // GTEST_OS_WINDOWS_MOBILE
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/');
+#endif
}
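
TempDir() now funnels every platform through GetTempDirFromEnv(), which walks the given environment variables in order, falls back to a default, and guarantees a trailing separator. An illustrative check of the resulting behaviour, assuming a POSIX host and linking against gtest_main (not part of the patch):

    #include <cstdio>
    #include "gtest/gtest.h"

    TEST(TempDirDemo, ResolvesFromEnvironment) {
      // With TEST_TMPDIR=/var/tmp this prints "/var/tmp/"; unset, it falls back
      // to TMPDIR and finally "/tmp/", always ending in the separator.
      std::printf("TempDir() = %s\n", testing::TempDir().c_str());
    }
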
// Class ScopedTrace
@@ -6738,8 +6788,7 @@ void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
}
// Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace()
- GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
UnitTest::GetInstance()->PopGTestTrace();
}
diff --git a/third_party/googletest/src/googletest/src/gtest_main.cc b/third_party/googletest/src/googletest/src/gtest_main.cc
index 46b27c3d7..44976375c 100644
--- a/third_party/googletest/src/googletest/src/gtest_main.cc
+++ b/third_party/googletest/src/googletest/src/gtest_main.cc
@@ -28,15 +28,14 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cstdio>
+
#include "gtest/gtest.h"
#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
#if GTEST_OS_ESP8266
extern "C" {
#endif
-void setup() {
- testing::InitGoogleTest();
-}
+void setup() { testing::InitGoogleTest(); }
void loop() { RUN_ALL_TESTS(); }
diff --git a/tools/dump_obu.cc b/tools/dump_obu.cc
index 30ee5e7a1..b9ff985c4 100644
--- a/tools/dump_obu.cc
+++ b/tools/dump_obu.cc
@@ -54,11 +54,15 @@ void PrintUsage() {
}
VideoFileType GetFileType(InputContext *ctx) {
- if (file_is_ivf(ctx->avx_ctx)) return FILE_TYPE_IVF;
- if (file_is_obu(ctx->obu_ctx)) return FILE_TYPE_OBU;
+ // TODO(https://crbug.com/aomedia/1706): webm type does not support reading
+ // from stdin yet, and file_is_webm is not using the detect buffer when
+ // determining the type. Therefore it should only be checked when using a file
+ // and needs to be checked prior to other types.
#if CONFIG_WEBM_IO
if (file_is_webm(ctx->webm_ctx, ctx->avx_ctx)) return FILE_TYPE_WEBM;
#endif
+ if (file_is_ivf(ctx->avx_ctx)) return FILE_TYPE_IVF;
+ if (file_is_obu(ctx->obu_ctx)) return FILE_TYPE_OBU;
return FILE_TYPE_RAW;
}
@@ -66,7 +70,7 @@ bool ReadTemporalUnit(InputContext *ctx, size_t *unit_size) {
const VideoFileType file_type = ctx->avx_ctx->file_type;
switch (file_type) {
case FILE_TYPE_IVF: {
- if (ivf_read_frame(ctx->avx_ctx->file, &ctx->unit_buffer, unit_size,
+ if (ivf_read_frame(ctx->avx_ctx, &ctx->unit_buffer, unit_size,
&ctx->unit_buffer_size, NULL)) {
return false;
}