diff options
Diffstat (limited to 'third_party')
501 files changed, 35545 insertions, 11725 deletions
diff --git a/third_party/abseil-cpp/CMake/AbseilDll.cmake b/third_party/abseil-cpp/CMake/AbseilDll.cmake index 39f85f2ffd..8ee4120f69 100644 --- a/third_party/abseil-cpp/CMake/AbseilDll.cmake +++ b/third_party/abseil-cpp/CMake/AbseilDll.cmake @@ -1,4 +1,5 @@ include(CMakeParseArguments) +include(GNUInstallDirs) set(ABSL_INTERNAL_DLL_FILES "algorithm/algorithm.h" @@ -196,16 +197,27 @@ set(ABSL_INTERNAL_DLL_FILES "strings/cord.h" "strings/escaping.cc" "strings/escaping.h" + "strings/internal/charconv_bigint.cc" + "strings/internal/charconv_bigint.h" + "strings/internal/charconv_parse.cc" + "strings/internal/charconv_parse.h" "strings/internal/cord_internal.cc" "strings/internal/cord_internal.h" "strings/internal/cord_rep_flat.h" "strings/internal/cord_rep_ring.cc" "strings/internal/cord_rep_ring.h" "strings/internal/cord_rep_ring_reader.h" - "strings/internal/charconv_bigint.cc" - "strings/internal/charconv_bigint.h" - "strings/internal/charconv_parse.cc" - "strings/internal/charconv_parse.h" + "strings/internal/cordz_functions.cc" + "strings/internal/cordz_functions.h" + "strings/internal/cordz_handle.cc" + "strings/internal/cordz_handle.h" + "strings/internal/cordz_info.cc" + "strings/internal/cordz_info.h" + "strings/internal/cordz_sample_token.cc" + "strings/internal/cordz_sample_token.h" + "strings/internal/cordz_statistics.h" + "strings/internal/cordz_update_scope.h" + "strings/internal/cordz_update_tracker.h" "strings/internal/stl_type_traits.h" "strings/internal/string_constant.h" "strings/match.cc" @@ -500,7 +512,7 @@ function(absl_make_dll) abseil_dll PUBLIC "$<BUILD_INTERFACE:${ABSL_COMMON_INCLUDE_DIRS}>" - $<INSTALL_INTERFACE:${ABSL_INSTALL_INCLUDEDIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> ) target_compile_options( @@ -518,8 +530,8 @@ function(absl_make_dll) ${ABSL_CC_LIB_DEFINES} ) install(TARGETS abseil_dll EXPORT ${PROJECT_NAME}Targets - RUNTIME DESTINATION ${ABSL_INSTALL_BINDIR} - LIBRARY DESTINATION ${ABSL_INSTALL_LIBDIR} - ARCHIVE 
DESTINATION ${ABSL_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endfunction() diff --git a/third_party/abseil-cpp/CMake/AbseilHelpers.cmake b/third_party/abseil-cpp/CMake/AbseilHelpers.cmake index 8502c02c9a..6a64a2c788 100644 --- a/third_party/abseil-cpp/CMake/AbseilHelpers.cmake +++ b/third_party/abseil-cpp/CMake/AbseilHelpers.cmake @@ -17,7 +17,6 @@ include(CMakeParseArguments) include(AbseilConfigureCopts) include(AbseilDll) -include(AbseilInstallDirs) # The IDE folder for Abseil that will be used if Abseil is included in a CMake # project that sets @@ -142,7 +141,8 @@ function(absl_cc_library) endif() # Generate a pkg-config file for every library: - if((_build_type STREQUAL "static" OR _build_type STREQUAL "shared") AND ABSL_ENABLE_INSTALL) + if((_build_type STREQUAL "static" OR _build_type STREQUAL "shared") + AND ABSL_ENABLE_INSTALL) if(NOT ABSL_CC_LIB_TESTONLY) if(absl_VERSION) set(PC_VERSION "${absl_VERSION}") @@ -151,6 +151,10 @@ function(absl_cc_library) endif() foreach(dep ${ABSL_CC_LIB_DEPS}) if(${dep} MATCHES "^absl::(.*)") + # Join deps with commas. 
+ if(PC_DEPS) + set(PC_DEPS "${PC_DEPS},") + endif() set(PC_DEPS "${PC_DEPS} absl_${CMAKE_MATCH_1} = ${PC_VERSION}") endif() endforeach() @@ -167,18 +171,18 @@ function(absl_cc_library) FILE(GENERATE OUTPUT "${CMAKE_BINARY_DIR}/lib/pkgconfig/absl_${_NAME}.pc" CONTENT "\ prefix=${CMAKE_INSTALL_PREFIX}\n\ exec_prefix=\${prefix}\n\ -libdir=\${prefix}/lib\n\ -includedir=\${prefix}/include\n\ +libdir=${CMAKE_INSTALL_FULL_LIBDIR}\n\ +includedir=${CMAKE_INSTALL_FULL_INCLUDEDIR}\n\ \n\ Name: absl_${_NAME}\n\ Description: Abseil ${_NAME} library\n\ URL: https://abseil.io/\n\ Version: ${PC_VERSION}\n\ -Requires.private:${PC_DEPS}\n\ +Requires:${PC_DEPS}\n\ Libs: -L\${libdir} $<JOIN:${ABSL_CC_LIB_LINKOPTS}, > $<$<NOT:$<BOOL:${ABSL_CC_LIB_IS_INTERFACE}>>:-labsl_${_NAME}>\n\ Cflags: -I\${includedir}${PC_CFLAGS}\n") INSTALL(FILES "${CMAKE_BINARY_DIR}/lib/pkgconfig/absl_${_NAME}.pc" - DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig") + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") endif() endif() @@ -235,7 +239,7 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") target_include_directories(${_NAME} PUBLIC "$<BUILD_INTERFACE:${ABSL_COMMON_INCLUDE_DIRS}>" - $<INSTALL_INTERFACE:${ABSL_INSTALL_INCLUDEDIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> ) target_compile_options(${_NAME} PRIVATE ${ABSL_CC_LIB_COPTS}) @@ -260,7 +264,6 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") if(ABSL_ENABLE_INSTALL) set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "absl_${_NAME}" - # TODO(b/173696973): Figure out how to set SOVERSION for LTS releases. SOVERSION 0 ) endif() @@ -270,7 +273,7 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") target_include_directories(${_NAME} INTERFACE "$<BUILD_INTERFACE:${ABSL_COMMON_INCLUDE_DIRS}>" - $<INSTALL_INTERFACE:${ABSL_INSTALL_INCLUDEDIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> ) if (_build_type STREQUAL "dll") @@ -290,9 +293,9 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") # installed abseil can't be tested. 
if(NOT ABSL_CC_LIB_TESTONLY AND ABSL_ENABLE_INSTALL) install(TARGETS ${_NAME} EXPORT ${PROJECT_NAME}Targets - RUNTIME DESTINATION ${ABSL_INSTALL_BINDIR} - LIBRARY DESTINATION ${ABSL_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${ABSL_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() @@ -333,8 +336,8 @@ endfunction() # "awesome_test.cc" # DEPS # absl::awesome -# gmock -# gtest_main +# GTest::gmock +# GTest::gtest_main # ) function(absl_cc_test) if(NOT BUILD_TESTING) diff --git a/third_party/abseil-cpp/CMake/AbseilInstallDirs.cmake b/third_party/abseil-cpp/CMake/AbseilInstallDirs.cmake deleted file mode 100644 index 6fc914b60f..0000000000 --- a/third_party/abseil-cpp/CMake/AbseilInstallDirs.cmake +++ /dev/null @@ -1,20 +0,0 @@ -include(GNUInstallDirs) - -# absl_VERSION is only set if we are an LTS release being installed, in which -# case it may be into a system directory and so we need to make subdirectories -# for each installed version of Abseil. This mechanism is implemented in -# Abseil's internal Copybara (https://github.com/google/copybara) workflows and -# isn't visible in the CMake buildsystem itself. 
- -if(absl_VERSION) - set(ABSL_SUBDIR "${PROJECT_NAME}_${PROJECT_VERSION}") - set(ABSL_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}/${ABSL_SUBDIR}") - set(ABSL_INSTALL_CONFIGDIR "${CMAKE_INSTALL_LIBDIR}/cmake/${ABSL_SUBDIR}") - set(ABSL_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/${ABSL_SUBDIR}") - set(ABSL_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}/${ABSL_SUBDIR}") -else() - set(ABSL_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}") - set(ABSL_INSTALL_CONFIGDIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") - set(ABSL_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}") - set(ABSL_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}") -endif() diff --git a/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt b/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt index 06b797e9ed..eebfe617a4 100644 --- a/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt +++ b/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt @@ -18,8 +18,6 @@ cmake_minimum_required(VERSION 3.5) project(absl_cmake_testing CXX) -set(CMAKE_CXX_STANDARD 11) - add_executable(simple simple.cc) find_package(absl REQUIRED) diff --git a/third_party/abseil-cpp/CMakeLists.txt b/third_party/abseil-cpp/CMakeLists.txt index e68810e3cf..42bcbe100b 100644 --- a/third_party/abseil-cpp/CMakeLists.txt +++ b/third_party/abseil-cpp/CMakeLists.txt @@ -41,11 +41,16 @@ if (POLICY CMP0077) cmake_policy(SET CMP0077 NEW) endif (POLICY CMP0077) +# Allow the user to specify the MSVC runtime +if (POLICY CMP0091) + cmake_policy(SET CMP0091 NEW) +endif (POLICY CMP0091) + # Set BUILD_TESTING to OFF by default. # This must come before the project() and include(CTest) lines. OPTION(BUILD_TESTING "Build tests" OFF) -project(absl CXX) +project(absl LANGUAGES CXX) include(CTest) # Output directory is correct by default for most build setups. 
However, when @@ -67,8 +72,8 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/absl/copts ) -include(AbseilInstallDirs) include(CMakePackageConfigHelpers) +include(GNUInstallDirs) include(AbseilDll) include(AbseilHelpers) @@ -97,9 +102,18 @@ endif() ## pthread find_package(Threads REQUIRED) +include(CMakeDependentOption) + option(ABSL_USE_EXTERNAL_GOOGLETEST "If ON, Abseil will assume that the targets for GoogleTest are already provided by the including project. This makes sense when Abseil is used with add_subproject." OFF) +cmake_dependent_option(ABSL_FIND_GOOGLETEST + "If ON, Abseil will use find_package(GTest) rather than assuming that GoogleTest is already provided by the including project." + ON + "ABSL_USE_EXTERNAL_GOOGLETEST" + OFF) + + option(ABSL_USE_GOOGLETEST_HEAD "If ON, abseil will download HEAD from GoogleTest at config time." OFF) @@ -111,7 +125,15 @@ set(ABSL_LOCAL_GOOGLETEST_DIR "/usr/src/googletest" CACHE PATH if(BUILD_TESTING) ## check targets - if (NOT ABSL_USE_EXTERNAL_GOOGLETEST) + if (ABSL_USE_EXTERNAL_GOOGLETEST) + if (ABSL_FIND_GOOGLETEST) + find_package(GTest REQUIRED) + else() + if (NOT TARGET gtest AND NOT TARGET GTest::gtest) + message(FATAL_ERROR "ABSL_USE_EXTERNAL_GOOGLETEST is ON and ABSL_FIND_GOOGLETEST is OFF, which means that the top-level project must build the Google Test project. However, the target gtest was not found.") + endif() + endif() + else() set(absl_gtest_build_dir ${CMAKE_BINARY_DIR}/googletest-build) if(ABSL_USE_GOOGLETEST_HEAD AND ABSL_GOOGLETEST_DOWNLOAD_URL) message(FATAL_ERROR "Do not set both ABSL_USE_GOOGLETEST_HEAD and ABSL_GOOGLETEST_DOWNLOAD_URL") @@ -129,14 +151,22 @@ if(BUILD_TESTING) include(CMake/Googletest/DownloadGTest.cmake) endif() - check_target(gtest) - check_target(gtest_main) - check_target(gmock) + if (NOT ABSL_FIND_GOOGLETEST) + # When Google Test is included directly rather than through find_package, the aliases are missing. 
+ add_library(GTest::gtest_main ALIAS gtest_main) + add_library(GTest::gtest ALIAS gtest) + add_library(GTest::gmock ALIAS gmock) + endif() + + check_target(GTest::gtest) + check_target(GTest::gtest_main) + check_target(GTest::gmock) + check_target(GTest::gmock_main) list(APPEND ABSL_TEST_COMMON_LIBRARIES - gtest_main - gtest - gmock + GTest::gtest_main + GTest::gtest + GTest::gmock ${CMAKE_THREAD_LIBS_INIT} ) endif() @@ -144,7 +174,6 @@ endif() add_subdirectory(absl) if(ABSL_ENABLE_INSTALL) - message(FATAL_ERROR "Please do not install abseil") # absl:lts-remove-begin(system installation is supported for LTS releases) # We don't support system-wide installation list(APPEND SYSTEM_INSTALL_DIRS "/usr/local" "/usr" "/opt/" "/opt/local" "c:/Program Files/${PROJECT_NAME}") @@ -160,16 +189,16 @@ if(ABSL_ENABLE_INSTALL) # install as a subdirectory only install(EXPORT ${PROJECT_NAME}Targets NAMESPACE absl:: - DESTINATION "${ABSL_INSTALL_CONFIGDIR}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) configure_package_config_file( CMake/abslConfig.cmake.in "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" - INSTALL_DESTINATION "${ABSL_INSTALL_CONFIGDIR}" + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" - DESTINATION "${ABSL_INSTALL_CONFIGDIR}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) # Abseil only has a version in LTS releases. 
This mechanism is accomplished @@ -182,12 +211,12 @@ if(ABSL_ENABLE_INSTALL) ) install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" - DESTINATION ${ABSL_INSTALL_CONFIGDIR} + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) endif() # absl_VERSION install(DIRECTORY absl - DESTINATION ${ABSL_INSTALL_INCLUDEDIR} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.inc" PATTERN "*.h" diff --git a/third_party/abseil-cpp/absl/CMakeLists.txt b/third_party/abseil-cpp/absl/CMakeLists.txt index fbfa7822b5..a41e1eeb35 100644 --- a/third_party/abseil-cpp/absl/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(base) add_subdirectory(algorithm) +add_subdirectory(cleanup) add_subdirectory(container) add_subdirectory(debugging) add_subdirectory(flags) diff --git a/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt b/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt index 56cd0fb85b..609d858946 100644 --- a/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt @@ -35,7 +35,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::algorithm - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -65,5 +65,5 @@ absl_cc_test( absl::core_headers absl::memory absl::span - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/algorithm/container.h b/third_party/abseil-cpp/absl/algorithm/container.h index 6398438f08..1652e7b055 100644 --- a/third_party/abseil-cpp/absl/algorithm/container.h +++ b/third_party/abseil-cpp/absl/algorithm/container.h @@ -905,11 +905,11 @@ void c_sort(C& c) { // Overload of c_sort() for performing a `comp` comparison other than the // default `operator<`. 
-template <typename C, typename Compare> -void c_sort(C& c, Compare&& comp) { +template <typename C, typename LessThan> +void c_sort(C& c, LessThan&& comp) { std::sort(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_stable_sort() @@ -925,11 +925,11 @@ void c_stable_sort(C& c) { // Overload of c_stable_sort() for performing a `comp` comparison other than the // default `operator<`. -template <typename C, typename Compare> -void c_stable_sort(C& c, Compare&& comp) { +template <typename C, typename LessThan> +void c_stable_sort(C& c, LessThan&& comp) { std::stable_sort(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_sorted() @@ -944,11 +944,11 @@ bool c_is_sorted(const C& c) { // c_is_sorted() overload for performing a `comp` comparison other than the // default `operator<`. -template <typename C, typename Compare> -bool c_is_sorted(const C& c, Compare&& comp) { +template <typename C, typename LessThan> +bool c_is_sorted(const C& c, LessThan&& comp) { return std::is_sorted(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_partial_sort() @@ -966,14 +966,14 @@ void c_partial_sort( // Overload of c_partial_sort() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename RandomAccessContainer, typename Compare> +template <typename RandomAccessContainer, typename LessThan> void c_partial_sort( RandomAccessContainer& sequence, container_algorithm_internal::ContainerIter<RandomAccessContainer> middle, - Compare&& comp) { + LessThan&& comp) { std::partial_sort(container_algorithm_internal::c_begin(sequence), middle, container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_partial_sort_copy() @@ -994,15 +994,15 @@ c_partial_sort_copy(const C& sequence, RandomAccessContainer& result) { // Overload of c_partial_sort_copy() for performing a `comp` comparison other // than the default `operator<`. -template <typename C, typename RandomAccessContainer, typename Compare> +template <typename C, typename RandomAccessContainer, typename LessThan> container_algorithm_internal::ContainerIter<RandomAccessContainer> c_partial_sort_copy(const C& sequence, RandomAccessContainer& result, - Compare&& comp) { + LessThan&& comp) { return std::partial_sort_copy(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), container_algorithm_internal::c_begin(result), container_algorithm_internal::c_end(result), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_sorted_until() @@ -1018,12 +1018,12 @@ container_algorithm_internal::ContainerIter<C> c_is_sorted_until(C& c) { // Overload of c_is_sorted_until() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename C, typename Compare> +template <typename C, typename LessThan> container_algorithm_internal::ContainerIter<C> c_is_sorted_until( - C& c, Compare&& comp) { + C& c, LessThan&& comp) { return std::is_sorted_until(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_nth_element() @@ -1043,14 +1043,14 @@ void c_nth_element( // Overload of c_nth_element() for performing a `comp` comparison other than // the default `operator<`. -template <typename RandomAccessContainer, typename Compare> +template <typename RandomAccessContainer, typename LessThan> void c_nth_element( RandomAccessContainer& sequence, container_algorithm_internal::ContainerIter<RandomAccessContainer> nth, - Compare&& comp) { + LessThan&& comp) { std::nth_element(container_algorithm_internal::c_begin(sequence), nth, container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1072,12 +1072,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_lower_bound( // Overload of c_lower_bound() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename Sequence, typename T, typename Compare> +template <typename Sequence, typename T, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_lower_bound( - Sequence& sequence, T&& value, Compare&& comp) { + Sequence& sequence, T&& value, LessThan&& comp) { return std::lower_bound(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<T>(value), std::forward<Compare>(comp)); + std::forward<T>(value), std::forward<LessThan>(comp)); } // c_upper_bound() @@ -1095,12 +1095,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_upper_bound( // Overload of c_upper_bound() for performing a `comp` comparison other than // the default `operator<`. -template <typename Sequence, typename T, typename Compare> +template <typename Sequence, typename T, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_upper_bound( - Sequence& sequence, T&& value, Compare&& comp) { + Sequence& sequence, T&& value, LessThan&& comp) { return std::upper_bound(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<T>(value), std::forward<Compare>(comp)); + std::forward<T>(value), std::forward<LessThan>(comp)); } // c_equal_range() @@ -1118,12 +1118,12 @@ c_equal_range(Sequence& sequence, T&& value) { // Overload of c_equal_range() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename Sequence, typename T, typename Compare> +template <typename Sequence, typename T, typename LessThan> container_algorithm_internal::ContainerIterPairType<Sequence, Sequence> -c_equal_range(Sequence& sequence, T&& value, Compare&& comp) { +c_equal_range(Sequence& sequence, T&& value, LessThan&& comp) { return std::equal_range(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<T>(value), std::forward<Compare>(comp)); + std::forward<T>(value), std::forward<LessThan>(comp)); } // c_binary_search() @@ -1140,12 +1140,12 @@ bool c_binary_search(Sequence&& sequence, T&& value) { // Overload of c_binary_search() for performing a `comp` comparison other than // the default `operator<`. -template <typename Sequence, typename T, typename Compare> -bool c_binary_search(Sequence&& sequence, T&& value, Compare&& comp) { +template <typename Sequence, typename T, typename LessThan> +bool c_binary_search(Sequence&& sequence, T&& value, LessThan&& comp) { return std::binary_search(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), std::forward<T>(value), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1166,14 +1166,14 @@ OutputIterator c_merge(const C1& c1, const C2& c2, OutputIterator result) { // Overload of c_merge() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare> +template <typename C1, typename C2, typename OutputIterator, typename LessThan> OutputIterator c_merge(const C1& c1, const C2& c2, OutputIterator result, - Compare&& comp) { + LessThan&& comp) { return std::merge(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), result, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_inplace_merge() @@ -1189,13 +1189,13 @@ void c_inplace_merge(C& c, // Overload of c_inplace_merge() for performing a merge using a `comp` other // than `operator<`. -template <typename C, typename Compare> +template <typename C, typename LessThan> void c_inplace_merge(C& c, container_algorithm_internal::ContainerIter<C> middle, - Compare&& comp) { + LessThan&& comp) { std::inplace_merge(container_algorithm_internal::c_begin(c), middle, container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_includes() @@ -1213,13 +1213,13 @@ bool c_includes(const C1& c1, const C2& c2) { // Overload of c_includes() for performing a merge using a `comp` other than // `operator<`. -template <typename C1, typename C2, typename Compare> -bool c_includes(const C1& c1, const C2& c2, Compare&& comp) { +template <typename C1, typename C2, typename LessThan> +bool c_includes(const C1& c1, const C2& c2, LessThan&& comp) { return std::includes(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_union() @@ -1243,7 +1243,7 @@ OutputIterator c_set_union(const C1& c1, const C2& c2, OutputIterator output) { // Overload of c_set_union() for performing a merge using a `comp` other than // `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1251,12 +1251,12 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, !container_algorithm_internal::IsUnorderedContainer<C2>::value, void>::type> OutputIterator c_set_union(const C1& c1, const C2& c2, OutputIterator output, - Compare&& comp) { + LessThan&& comp) { return std::set_union(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_intersection() @@ -1280,7 +1280,7 @@ OutputIterator c_set_intersection(const C1& c1, const C2& c2, // Overload of c_set_intersection() for performing a merge using a `comp` other // than `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1288,12 +1288,12 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, !container_algorithm_internal::IsUnorderedContainer<C2>::value, void>::type> OutputIterator c_set_intersection(const C1& c1, const C2& c2, - OutputIterator output, Compare&& comp) { + OutputIterator output, LessThan&& comp) { return std::set_intersection(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_difference() @@ -1318,7 +1318,7 @@ OutputIterator c_set_difference(const C1& c1, const C2& c2, // Overload of c_set_difference() for performing a merge using a `comp` other // than `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1326,12 +1326,12 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, !container_algorithm_internal::IsUnorderedContainer<C2>::value, void>::type> OutputIterator c_set_difference(const C1& c1, const C2& c2, - OutputIterator output, Compare&& comp) { + OutputIterator output, LessThan&& comp) { return std::set_difference(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_symmetric_difference() @@ -1357,7 +1357,7 @@ OutputIterator c_set_symmetric_difference(const C1& c1, const C2& c2, // Overload of c_set_symmetric_difference() for performing a merge using a // `comp` other than `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1366,13 +1366,13 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, void>::type> OutputIterator c_set_symmetric_difference(const C1& c1, const C2& c2, OutputIterator output, - Compare&& comp) { + LessThan&& comp) { return std::set_symmetric_difference( container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1391,11 +1391,11 @@ void c_push_heap(RandomAccessContainer& sequence) { // Overload of c_push_heap() for performing a push operation on a heap using a // `comp` other than `operator<`. -template <typename RandomAccessContainer, typename Compare> -void c_push_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_push_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::push_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_pop_heap() @@ -1410,11 +1410,11 @@ void c_pop_heap(RandomAccessContainer& sequence) { // Overload of c_pop_heap() for performing a pop operation on a heap using a // `comp` other than `operator<`. 
-template <typename RandomAccessContainer, typename Compare> -void c_pop_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_pop_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::pop_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_make_heap() @@ -1429,11 +1429,11 @@ void c_make_heap(RandomAccessContainer& sequence) { // Overload of c_make_heap() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> -void c_make_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_make_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::make_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_sort_heap() @@ -1448,11 +1448,11 @@ void c_sort_heap(RandomAccessContainer& sequence) { // Overload of c_sort_heap() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> -void c_sort_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_sort_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::sort_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_heap() @@ -1467,11 +1467,11 @@ bool c_is_heap(const RandomAccessContainer& sequence) { // Overload of c_is_heap() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> -bool c_is_heap(const 
RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +bool c_is_heap(const RandomAccessContainer& sequence, LessThan&& comp) { return std::is_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_heap_until() @@ -1487,12 +1487,12 @@ c_is_heap_until(RandomAccessContainer& sequence) { // Overload of c_is_heap_until() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> +template <typename RandomAccessContainer, typename LessThan> container_algorithm_internal::ContainerIter<RandomAccessContainer> -c_is_heap_until(RandomAccessContainer& sequence, Compare&& comp) { +c_is_heap_until(RandomAccessContainer& sequence, LessThan&& comp) { return std::is_heap_until(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1513,12 +1513,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_min_element( // Overload of c_min_element() for performing a `comp` comparison other than // `operator<`. -template <typename Sequence, typename Compare> +template <typename Sequence, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_min_element( - Sequence& sequence, Compare&& comp) { + Sequence& sequence, LessThan&& comp) { return std::min_element(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_max_element() @@ -1535,12 +1535,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_max_element( // Overload of c_max_element() for performing a `comp` comparison other than // `operator<`. 
-template <typename Sequence, typename Compare> +template <typename Sequence, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_max_element( - Sequence& sequence, Compare&& comp) { + Sequence& sequence, LessThan&& comp) { return std::max_element(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_minmax_element() @@ -1558,12 +1558,12 @@ c_minmax_element(C& c) { // Overload of c_minmax_element() for performing `comp` comparisons other than // `operator<`. -template <typename C, typename Compare> +template <typename C, typename LessThan> container_algorithm_internal::ContainerIterPairType<C, C> -c_minmax_element(C& c, Compare&& comp) { +c_minmax_element(C& c, LessThan&& comp) { return std::minmax_element(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1588,15 +1588,15 @@ bool c_lexicographical_compare(Sequence1&& sequence1, Sequence2&& sequence2) { // Overload of c_lexicographical_compare() for performing a lexicographical // comparison using a `comp` operator instead of `operator<`. 
-template <typename Sequence1, typename Sequence2, typename Compare> +template <typename Sequence1, typename Sequence2, typename LessThan> bool c_lexicographical_compare(Sequence1&& sequence1, Sequence2&& sequence2, - Compare&& comp) { + LessThan&& comp) { return std::lexicographical_compare( container_algorithm_internal::c_begin(sequence1), container_algorithm_internal::c_end(sequence1), container_algorithm_internal::c_begin(sequence2), container_algorithm_internal::c_end(sequence2), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_next_permutation() @@ -1612,11 +1612,11 @@ bool c_next_permutation(C& c) { // Overload of c_next_permutation() for performing a lexicographical // comparison using a `comp` operator instead of `operator<`. -template <typename C, typename Compare> -bool c_next_permutation(C& c, Compare&& comp) { +template <typename C, typename LessThan> +bool c_next_permutation(C& c, LessThan&& comp) { return std::next_permutation(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_prev_permutation() @@ -1632,11 +1632,11 @@ bool c_prev_permutation(C& c) { // Overload of c_prev_permutation() for performing a lexicographical // comparison using a `comp` operator instead of `operator<`. 
-template <typename C, typename Compare> -bool c_prev_permutation(C& c, Compare&& comp) { +template <typename C, typename LessThan> +bool c_prev_permutation(C& c, LessThan&& comp) { return std::prev_permutation(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ diff --git a/third_party/abseil-cpp/absl/base/CMakeLists.txt b/third_party/abseil-cpp/absl/base/CMakeLists.txt index 981b8cc008..7d56aa1346 100644 --- a/third_party/abseil-cpp/absl/base/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/base/CMakeLists.txt @@ -230,7 +230,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::config - gtest + GTest::gtest TESTONLY ) @@ -259,7 +259,7 @@ absl_cc_library( absl::meta absl::strings absl::utility - gtest + GTest::gtest TESTONLY ) @@ -273,7 +273,7 @@ absl_cc_test( DEPS absl::exception_safety_testing absl::memory - gtest_main + GTest::gtest_main ) absl_cc_library( @@ -300,8 +300,8 @@ absl_cc_test( absl::atomic_hook_test_helper absl::atomic_hook absl::core_headers - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -314,7 +314,7 @@ absl_cc_test( DEPS absl::base absl::core_headers - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -327,8 +327,8 @@ absl_cc_test( DEPS absl::errno_saver absl::strerror - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -342,7 +342,7 @@ absl_cc_test( absl::base absl::config absl::throw_delegate - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -357,7 +357,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::base_internal - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -371,8 +371,8 @@ absl_cc_test( absl::base_internal absl::memory absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -388,7 +388,7 @@ absl_cc_library( absl::base_internal absl::core_headers absl::synchronization - 
gtest + GTest::gtest TESTONLY ) @@ -406,7 +406,7 @@ absl_cc_test( absl::config absl::core_headers absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_library( @@ -435,7 +435,7 @@ absl_cc_test( absl::base absl::config absl::endian - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -448,7 +448,7 @@ absl_cc_test( DEPS absl::config absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -462,7 +462,7 @@ absl_cc_test( absl::base absl::core_headers absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -475,7 +475,7 @@ absl_cc_test( DEPS absl::raw_logging_internal absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -488,7 +488,7 @@ absl_cc_test( DEPS absl::base absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -516,7 +516,7 @@ absl_cc_test( absl::core_headers absl::synchronization Threads::Threads - gtest_main + GTest::gtest_main ) absl_cc_library( @@ -543,7 +543,7 @@ absl_cc_test( DEPS absl::exponential_biased absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -570,7 +570,7 @@ absl_cc_test( DEPS absl::core_headers absl::periodic_sampler - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -596,7 +596,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::scoped_set_env - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -620,8 +620,8 @@ absl_cc_test( absl::flags_marshalling absl::log_severity absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -651,8 +651,8 @@ absl_cc_test( DEPS absl::strerror absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -677,7 +677,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::fast_type_id - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -690,5 +690,5 @@ absl_cc_test( DEPS absl::core_headers absl::optional - gtest_main + GTest::gtest_main ) diff --git a/third_party/abseil-cpp/absl/base/attributes.h b/third_party/abseil-cpp/absl/base/attributes.h index 
cf2cb5501e..52139556f2 100644 --- a/third_party/abseil-cpp/absl/base/attributes.h +++ b/third_party/abseil-cpp/absl/base/attributes.h @@ -131,14 +131,14 @@ // ABSL_ATTRIBUTE_WEAK // // Tags a function as weak for the purposes of compilation and linking. -// Weak attributes currently do not work properly in LLVM's Windows backend, -// so disable them there. See https://bugs.llvm.org/show_bug.cgi?id=37598 +// Weak attributes did not work properly in LLVM's Windows backend before +// 9.0.0, so disable them there. See https://bugs.llvm.org/show_bug.cgi?id=37598 // for further information. // The MinGW compiler doesn't complain about the weak attribute until the link // step, presumably because Windows doesn't use ELF binaries. #if (ABSL_HAVE_ATTRIBUTE(weak) || \ (defined(__GNUC__) && !defined(__clang__))) && \ - !(defined(__llvm__) && defined(_WIN32)) && !defined(__MINGW32__) + (!defined(_WIN32) || __clang_major__ < 9) && !defined(__MINGW32__) #undef ABSL_ATTRIBUTE_WEAK #define ABSL_ATTRIBUTE_WEAK __attribute__((weak)) #define ABSL_HAVE_ATTRIBUTE_WEAK 1 @@ -281,10 +281,7 @@ // ABSL_ATTRIBUTE_RETURNS_NONNULL // // Tells the compiler that a particular function never returns a null pointer. -#if ABSL_HAVE_ATTRIBUTE(returns_nonnull) || \ - (defined(__GNUC__) && \ - (__GNUC__ > 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \ - !defined(__clang__)) +#if ABSL_HAVE_ATTRIBUTE(returns_nonnull) #define ABSL_ATTRIBUTE_RETURNS_NONNULL __attribute__((returns_nonnull)) #else #define ABSL_ATTRIBUTE_RETURNS_NONNULL @@ -524,6 +521,13 @@ // ABSL_ATTRIBUTE_UNUSED // // Prevents the compiler from complaining about variables that appear unused. +// +// For code or headers that are assured to only build with C++17 and up, prefer +// just using the standard '[[maybe_unused]]' directly over this macro. 
+// +// Due to differences in positioning requirements between the old, compiler +// specific __attribute__ syntax and the now standard [[maybe_unused]], this +// macro does not attempt to take advantage of '[[maybe_unused]]'. #if ABSL_HAVE_ATTRIBUTE(unused) || (defined(__GNUC__) && !defined(__clang__)) #undef ABSL_ATTRIBUTE_UNUSED #define ABSL_ATTRIBUTE_UNUSED __attribute__((__unused__)) @@ -595,31 +599,24 @@ // case 42: // ... // -// Notes: when compiled with clang in C++11 mode, the ABSL_FALLTHROUGH_INTENDED -// macro is expanded to the [[clang::fallthrough]] attribute, which is analysed -// when performing switch labels fall-through diagnostic -// (`-Wimplicit-fallthrough`). See clang documentation on language extensions -// for details: +// Notes: When supported, GCC and Clang can issue a warning on switch labels +// with unannotated fallthrough using the warning `-Wimplicit-fallthrough`. See +// clang documentation on language extensions for details: // https://clang.llvm.org/docs/AttributeReference.html#fallthrough-clang-fallthrough // -// When used with unsupported compilers, the ABSL_FALLTHROUGH_INTENDED macro -// has no effect on diagnostics. In any case this macro has no effect on runtime +// When used with unsupported compilers, the ABSL_FALLTHROUGH_INTENDED macro has +// no effect on diagnostics. In any case this macro has no effect on runtime // behavior and performance of code. #ifdef ABSL_FALLTHROUGH_INTENDED #error "ABSL_FALLTHROUGH_INTENDED should not be defined." -#endif - -// TODO(zhangxy): Use c++17 standard [[fallthrough]] macro, when supported. 
-#if defined(__clang__) && defined(__has_warning) -#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#elif ABSL_HAVE_CPP_ATTRIBUTE(fallthrough) +#define ABSL_FALLTHROUGH_INTENDED [[fallthrough]] +#elif ABSL_HAVE_CPP_ATTRIBUTE(clang::fallthrough) #define ABSL_FALLTHROUGH_INTENDED [[clang::fallthrough]] -#endif -#elif defined(__GNUC__) && __GNUC__ >= 7 +#elif ABSL_HAVE_CPP_ATTRIBUTE(gnu::fallthrough) #define ABSL_FALLTHROUGH_INTENDED [[gnu::fallthrough]] -#endif - -#ifndef ABSL_FALLTHROUGH_INTENDED +#else #define ABSL_FALLTHROUGH_INTENDED \ do { \ } while (0) @@ -699,4 +696,26 @@ #define ABSL_ATTRIBUTE_PURE_FUNCTION #endif +// ABSL_ATTRIBUTE_LIFETIME_BOUND indicates that a resource owned by a function +// parameter or implicit object parameter is retained by the return value of the +// annotated function (or, for a parameter of a constructor, in the value of the +// constructed object). This attribute causes warnings to be produced if a +// temporary object does not live long enough. +// +// When applied to a reference parameter, the referenced object is assumed to be +// retained by the return value of the function. When applied to a non-reference +// parameter (for example, a pointer or a class type), all temporaries +// referenced by the parameter are assumed to be retained by the return value of +// the function. 
+// +// See also the upstream documentation: +// https://clang.llvm.org/docs/AttributeReference.html#lifetimebound +#if ABSL_HAVE_CPP_ATTRIBUTE(clang::lifetimebound) +#define ABSL_ATTRIBUTE_LIFETIME_BOUND [[clang::lifetimebound]] +#elif ABSL_HAVE_ATTRIBUTE(lifetimebound) +#define ABSL_ATTRIBUTE_LIFETIME_BOUND __attribute__((lifetimebound)) +#else +#define ABSL_ATTRIBUTE_LIFETIME_BOUND +#endif + #endif // ABSL_BASE_ATTRIBUTES_H_ diff --git a/third_party/abseil-cpp/absl/base/config.h b/third_party/abseil-cpp/absl/base/config.h index 95449969e7..0524196d56 100644 --- a/third_party/abseil-cpp/absl/base/config.h +++ b/third_party/abseil-cpp/absl/base/config.h @@ -166,6 +166,22 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || #define ABSL_HAVE_FEATURE(f) 0 #endif +// Portable check for GCC minimum version: +// https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +#define ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(x, y) \ + (__GNUC__ > (x) || __GNUC__ == (x) && __GNUC_MINOR__ >= (y)) +#else +#define ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(x, y) 0 +#endif + +#if defined(__clang__) && defined(__clang_major__) && defined(__clang_minor__) +#define ABSL_INTERNAL_HAVE_MIN_CLANG_VERSION(x, y) \ + (__clang_major__ > (x) || __clang_major__ == (x) && __clang_minor__ >= (y)) +#else +#define ABSL_INTERNAL_HAVE_MIN_CLANG_VERSION(x, y) 0 +#endif + // ABSL_HAVE_TLS is defined to 1 when __thread should be supported. // We assume __thread is supported on Linux when compiled with Clang or compiled // against libstdc++ with _GLIBCXX_HAVE_TLS defined. @@ -183,10 +199,9 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // gcc >= 4.8.1 using libstdc++, and Visual Studio. 
#ifdef ABSL_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE #error ABSL_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE cannot be directly set -#elif defined(_LIBCPP_VERSION) || \ - (!defined(__clang__) && defined(__GNUC__) && defined(__GLIBCXX__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) || \ - defined(_MSC_VER) +#elif defined(_LIBCPP_VERSION) || defined(_MSC_VER) || \ + (!defined(__clang__) && defined(__GLIBCXX__) && \ + ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(4, 8)) #define ABSL_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE 1 #endif @@ -205,10 +220,9 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || #error ABSL_HAVE_STD_IS_TRIVIALLY_CONSTRUCTIBLE cannot be directly set #elif defined(ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE) #error ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE cannot directly set -#elif (defined(__clang__) && defined(_LIBCPP_VERSION)) || \ - (!defined(__clang__) && defined(__GNUC__) && \ - (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 4)) && \ - (defined(_LIBCPP_VERSION) || defined(__GLIBCXX__))) || \ +#elif (defined(__clang__) && defined(_LIBCPP_VERSION)) || \ + (!defined(__clang__) && ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(7, 4) && \ + (defined(_LIBCPP_VERSION) || defined(__GLIBCXX__))) || \ (defined(_MSC_VER) && !defined(__NVCC__)) #define ABSL_HAVE_STD_IS_TRIVIALLY_CONSTRUCTIBLE 1 #define ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE 1 @@ -222,7 +236,7 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || #if ABSL_INTERNAL_HAS_KEYWORD(__builtin_LINE) && \ ABSL_INTERNAL_HAS_KEYWORD(__builtin_FILE) #define ABSL_HAVE_SOURCE_LOCATION_CURRENT 1 -#elif defined(__GNUC__) && __GNUC__ >= 5 +#elif ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(5, 0) #define ABSL_HAVE_SOURCE_LOCATION_CURRENT 1 #endif #endif @@ -319,25 +333,21 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // For further details, consult the compiler's documentation. #ifdef ABSL_HAVE_EXCEPTIONS #error ABSL_HAVE_EXCEPTIONS cannot be directly set. 
- -#elif defined(__clang__) - -#if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 6) +#elif ABSL_INTERNAL_HAVE_MIN_CLANG_VERSION(3, 6) // Clang >= 3.6 #if ABSL_HAVE_FEATURE(cxx_exceptions) #define ABSL_HAVE_EXCEPTIONS 1 #endif // ABSL_HAVE_FEATURE(cxx_exceptions) -#else +#elif defined(__clang__) // Clang < 3.6 // http://releases.llvm.org/3.6.0/tools/clang/docs/ReleaseNotes.html#the-exceptions-macro #if defined(__EXCEPTIONS) && ABSL_HAVE_FEATURE(cxx_exceptions) #define ABSL_HAVE_EXCEPTIONS 1 #endif // defined(__EXCEPTIONS) && ABSL_HAVE_FEATURE(cxx_exceptions) -#endif // __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 6) - // Handle remaining special cases and default to exceptions being supported. -#elif !(defined(__GNUC__) && (__GNUC__ < 5) && !defined(__EXCEPTIONS)) && \ - !(defined(__GNUC__) && (__GNUC__ >= 5) && !defined(__cpp_exceptions)) && \ +#elif !(defined(__GNUC__) && (__GNUC__ < 5) && !defined(__EXCEPTIONS)) && \ + !(ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(5, 0) && \ + !defined(__cpp_exceptions)) && \ !(defined(_MSC_VER) && !defined(_CPPUNWIND)) #define ABSL_HAVE_EXCEPTIONS 1 #endif @@ -690,10 +700,6 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // a compiler instrumentation module and a run-time library. #ifdef ABSL_HAVE_MEMORY_SANITIZER #error "ABSL_HAVE_MEMORY_SANITIZER cannot be directly set." -#elif defined(MEMORY_SANITIZER) -// The MEMORY_SANITIZER macro is deprecated but we will continue to honor it -// for now. -#define ABSL_HAVE_MEMORY_SANITIZER 1 #elif defined(__SANITIZE_MEMORY__) #define ABSL_HAVE_MEMORY_SANITIZER 1 #elif !defined(__native_client__) && ABSL_HAVE_FEATURE(memory_sanitizer) @@ -705,10 +711,6 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // ThreadSanitizer (TSan) is a fast data race detector. #ifdef ABSL_HAVE_THREAD_SANITIZER #error "ABSL_HAVE_THREAD_SANITIZER cannot be directly set." 
-#elif defined(THREAD_SANITIZER) -// The THREAD_SANITIZER macro is deprecated but we will continue to honor it -// for now. -#define ABSL_HAVE_THREAD_SANITIZER 1 #elif defined(__SANITIZE_THREAD__) #define ABSL_HAVE_THREAD_SANITIZER 1 #elif ABSL_HAVE_FEATURE(thread_sanitizer) @@ -720,10 +722,6 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // AddressSanitizer (ASan) is a fast memory error detector. #ifdef ABSL_HAVE_ADDRESS_SANITIZER #error "ABSL_HAVE_ADDRESS_SANITIZER cannot be directly set." -#elif defined(ADDRESS_SANITIZER) -// The ADDRESS_SANITIZER macro is deprecated but we will continue to honor it -// for now. -#define ABSL_HAVE_ADDRESS_SANITIZER 1 #elif defined(__SANITIZE_ADDRESS__) #define ABSL_HAVE_ADDRESS_SANITIZER 1 #elif ABSL_HAVE_FEATURE(address_sanitizer) diff --git a/third_party/abseil-cpp/absl/base/dynamic_annotations.h b/third_party/abseil-cpp/absl/base/dynamic_annotations.h index bf874db990..065bd5be09 100644 --- a/third_party/abseil-cpp/absl/base/dynamic_annotations.h +++ b/third_party/abseil-cpp/absl/base/dynamic_annotations.h @@ -468,7 +468,7 @@ using absl::base_internal::ValgrindSlowdown; __sanitizer_annotate_contiguous_container(beg, end, old_mid, new_mid) #define ABSL_ADDRESS_SANITIZER_REDZONE(name) \ struct { \ - char x[8] __attribute__((aligned(8))); \ + alignas(8) char x[8]; \ } name #else diff --git a/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h b/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h index 6ba89d05df..77a5aec642 100644 --- a/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h +++ b/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h @@ -536,7 +536,22 @@ class ThrowingValue : private exceptions_internal::TrackedObject { } // Memory management operators - // Args.. 
allows us to overload regular and placement new in one shot + static void* operator new(size_t s) noexcept( + IsSpecified(TypeSpec::kNoThrowNew)) { + if (!IsSpecified(TypeSpec::kNoThrowNew)) { + exceptions_internal::MaybeThrow(ABSL_PRETTY_FUNCTION, true); + } + return ::operator new(s); + } + + static void* operator new[](size_t s) noexcept( + IsSpecified(TypeSpec::kNoThrowNew)) { + if (!IsSpecified(TypeSpec::kNoThrowNew)) { + exceptions_internal::MaybeThrow(ABSL_PRETTY_FUNCTION, true); + } + return ::operator new[](s); + } + template <typename... Args> static void* operator new(size_t s, Args&&... args) noexcept( IsSpecified(TypeSpec::kNoThrowNew)) { @@ -557,12 +572,6 @@ class ThrowingValue : private exceptions_internal::TrackedObject { // Abseil doesn't support throwing overloaded operator delete. These are // provided so a throwing operator-new can clean up after itself. - // - // We provide both regular and templated operator delete because if only the - // templated version is provided as we did with operator new, the compiler has - // no way of knowing which overload of operator delete to call. See - // https://en.cppreference.com/w/cpp/memory/new/operator_delete and - // https://en.cppreference.com/w/cpp/language/delete for the gory details. void operator delete(void* p) noexcept { ::operator delete(p); } template <typename... 
Args> @@ -726,9 +735,8 @@ class ThrowingAllocator : private exceptions_internal::TrackedObject { ThrowingAllocator select_on_container_copy_construction() noexcept( IsSpecified(AllocSpec::kNoThrowAllocate)) { - auto& out = *this; ReadStateAndMaybeThrow(ABSL_PRETTY_FUNCTION); - return out; + return *this; } template <typename U> diff --git a/third_party/abseil-cpp/absl/base/internal/sysinfo.cc b/third_party/abseil-cpp/absl/base/internal/sysinfo.cc index 4a3b205034..08a1e28894 100644 --- a/third_party/abseil-cpp/absl/base/internal/sysinfo.cc +++ b/third_party/abseil-cpp/absl/base/internal/sysinfo.cc @@ -61,9 +61,76 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace base_internal { +namespace { + +#if defined(_WIN32) + +// Returns number of bits set in `bitMask` +DWORD Win32CountSetBits(ULONG_PTR bitMask) { + for (DWORD bitSetCount = 0; ; ++bitSetCount) { + if (bitMask == 0) return bitSetCount; + bitMask &= bitMask - 1; + } +} + +// Returns the number of logical CPUs using GetLogicalProcessorInformation(), or +// 0 if the number of processors is not available or can not be computed. 
+// https://docs.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformation +int Win32NumCPUs() { +#pragma comment(lib, "kernel32.lib") + using Info = SYSTEM_LOGICAL_PROCESSOR_INFORMATION; + + DWORD info_size = sizeof(Info); + Info* info(static_cast<Info*>(malloc(info_size))); + if (info == nullptr) return 0; + + bool success = GetLogicalProcessorInformation(info, &info_size); + if (!success && GetLastError() == ERROR_INSUFFICIENT_BUFFER) { + free(info); + info = static_cast<Info*>(malloc(info_size)); + if (info == nullptr) return 0; + success = GetLogicalProcessorInformation(info, &info_size); + } + + DWORD logicalProcessorCount = 0; + if (success) { + Info* ptr = info; + DWORD byteOffset = 0; + while (byteOffset + sizeof(Info) <= info_size) { + switch (ptr->Relationship) { + case RelationProcessorCore: + logicalProcessorCount += Win32CountSetBits(ptr->ProcessorMask); + break; + + case RelationNumaNode: + case RelationCache: + case RelationProcessorPackage: + // Ignore other entries + break; + + default: + // Ignore unknown entries + break; + } + byteOffset += sizeof(Info); + ptr++; + } + } + free(info); + return logicalProcessorCount; +} + +#endif + +} // namespace + + static int GetNumCPUs() { #if defined(__myriad2__) return 1; +#elif defined(_WIN32) + const unsigned hardware_concurrency = Win32NumCPUs(); + return hardware_concurrency ? 
hardware_concurrency : 1; #else // Other possibilities: // - Read /sys/devices/system/cpu/online and use cpumask_parse() diff --git a/third_party/abseil-cpp/absl/base/internal/thread_identity.cc b/third_party/abseil-cpp/absl/base/internal/thread_identity.cc index 6ea010ed0d..9950e63a79 100644 --- a/third_party/abseil-cpp/absl/base/internal/thread_identity.cc +++ b/third_party/abseil-cpp/absl/base/internal/thread_identity.cc @@ -120,10 +120,10 @@ void SetCurrentThreadIdentity( ABSL_THREAD_IDENTITY_MODE == ABSL_THREAD_IDENTITY_MODE_USE_CPP11 // Please see the comment on `CurrentThreadIdentityIfPresent` in -// thread_identity.h. Because DLLs cannot expose thread_local variables in -// headers, we opt for the correct-but-slower option of placing the definition -// of this function only in a translation unit inside DLL. -#if defined(ABSL_BUILD_DLL) || defined(ABSL_CONSUME_DLL) +// thread_identity.h. When we cannot expose thread_local variables in +// headers, we opt for the correct-but-slower option of not inlining this +// function. +#ifndef ABSL_INTERNAL_INLINE_CURRENT_THREAD_IDENTITY_IF_PRESENT ThreadIdentity* CurrentThreadIdentityIfPresent() { return thread_identity_ptr; } #endif #endif diff --git a/third_party/abseil-cpp/absl/base/internal/thread_identity.h b/third_party/abseil-cpp/absl/base/internal/thread_identity.h index 9ee651a3a6..6e25b92fa2 100644 --- a/third_party/abseil-cpp/absl/base/internal/thread_identity.h +++ b/third_party/abseil-cpp/absl/base/internal/thread_identity.h @@ -236,13 +236,18 @@ ABSL_CONST_INIT extern thread_local ThreadIdentity* thread_identity_ptr; #error Thread-local storage not detected on this platform #endif -// thread_local variables cannot be in headers exposed by DLLs. However, it is -// important for performance reasons in general that -// `CurrentThreadIdentityIfPresent` be inlined. This is not possible across a -// DLL boundary so, with DLLs, we opt to have the function not be inlined. 
Note +// thread_local variables cannot be in headers exposed by DLLs or in certain +// build configurations on Apple platforms. However, it is important for +// performance reasons in general that `CurrentThreadIdentityIfPresent` be +// inlined. In the other cases we opt to have the function not be inlined. Note // that `CurrentThreadIdentityIfPresent` is declared above so we can exclude -// this entire inline definition when compiling as a DLL. -#if !defined(ABSL_BUILD_DLL) && !defined(ABSL_CONSUME_DLL) +// this entire inline definition. +#if !defined(__APPLE__) && !defined(ABSL_BUILD_DLL) && \ + !defined(ABSL_CONSUME_DLL) +#define ABSL_INTERNAL_INLINE_CURRENT_THREAD_IDENTITY_IF_PRESENT 1 +#endif + +#ifdef ABSL_INTERNAL_INLINE_CURRENT_THREAD_IDENTITY_IF_PRESENT inline ThreadIdentity* CurrentThreadIdentityIfPresent() { return thread_identity_ptr; } diff --git a/third_party/abseil-cpp/absl/base/optimization.h b/third_party/abseil-cpp/absl/base/optimization.h index 6332b62584..d090be1286 100644 --- a/third_party/abseil-cpp/absl/base/optimization.h +++ b/third_party/abseil-cpp/absl/base/optimization.h @@ -106,9 +106,10 @@ // Cacheline aligning objects properly allows constructive memory sharing and // prevents destructive (or "false") memory sharing. // -// NOTE: this macro should be replaced with usage of `alignas()` using +// NOTE: callers should replace uses of this macro with `alignas()` using // `std::hardware_constructive_interference_size` and/or -// `std::hardware_destructive_interference_size` when available within C++17. +// `std::hardware_destructive_interference_size` when C++17 becomes available to +// them. // // See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0154r1.html // for more information. 
diff --git a/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt b/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt index a2dd78a84a..26a6d0dce3 100644 --- a/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt @@ -51,5 +51,5 @@ absl_cc_test( absl::cleanup absl::config absl::utility - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/container/CMakeLists.txt b/third_party/abseil-cpp/absl/container/CMakeLists.txt index 2d7d0e65f2..91c4015437 100644 --- a/third_party/abseil-cpp/absl/container/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/container/CMakeLists.txt @@ -80,7 +80,7 @@ absl_cc_test( absl::strings absl::test_instance_tracker absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -109,7 +109,7 @@ absl_cc_test( absl::optional absl::test_instance_tracker absl::utility - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -144,7 +144,7 @@ absl_cc_test( absl::exception_testing absl::hash_testing absl::memory - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -158,7 +158,7 @@ absl_cc_test( absl::fixed_array absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -222,7 +222,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -236,7 +236,7 @@ absl_cc_test( absl::inlined_vector absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -262,7 +262,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -297,7 +297,7 @@ absl_cc_test( absl::unordered_map_modifiers_test absl::any absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -335,7 +335,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -370,7 +370,7 @@ absl_cc_test( 
absl::unordered_map_lookup_test absl::unordered_map_members_test absl::unordered_map_modifiers_test - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -404,7 +404,7 @@ absl_cc_test( absl::unordered_set_lookup_test absl::unordered_set_members_test absl::unordered_set_modifiers_test - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -433,7 +433,7 @@ absl_cc_test( absl::container_memory absl::strings absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -465,7 +465,7 @@ absl_cc_test( absl::hash absl::random_random absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -507,7 +507,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::hash_policy_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -531,7 +531,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::hash_policy_traits - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -561,7 +561,7 @@ absl_cc_test( DEPS absl::hashtablez_sampler absl::have_sse - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -618,7 +618,7 @@ absl_cc_test( DEPS absl::hash_policy_traits absl::node_hash_policy - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -693,7 +693,7 @@ absl_cc_test( absl::core_headers absl::raw_logging_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -707,7 +707,7 @@ absl_cc_test( absl::raw_hash_set absl::tracked absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -740,7 +740,7 @@ absl_cc_test( absl::core_headers absl::raw_logging_internal absl::span - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -765,7 +765,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -779,7 +779,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -792,7 +792,7 @@ absl_cc_library( ${ABSL_TEST_COPTS} DEPS absl::type_traits - gmock + GTest::gmock TESTONLY ) @@ -806,7 +806,7 @@ absl_cc_library( 
DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -820,7 +820,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -834,7 +834,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -847,7 +847,7 @@ absl_cc_library( ${ABSL_TEST_COPTS} DEPS absl::type_traits - gmock + GTest::gmock TESTONLY ) @@ -861,7 +861,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -877,7 +877,7 @@ absl_cc_test( absl::unordered_set_lookup_test absl::unordered_set_members_test absl::unordered_set_modifiers_test - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -892,5 +892,5 @@ absl_cc_test( absl::unordered_map_lookup_test absl::unordered_map_members_test absl::unordered_map_modifiers_test - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/container/btree_test.cc b/third_party/abseil-cpp/absl/container/btree_test.cc index 74337df2c1..d5d79151aa 100644 --- a/third_party/abseil-cpp/absl/container/btree_test.cc +++ b/third_party/abseil-cpp/absl/container/btree_test.cc @@ -1708,10 +1708,25 @@ TEST(Btree, StrSplitCompatible) { EXPECT_EQ(split_set, expected_set); } -// We can't use EXPECT_EQ/etc. to compare absl::weak_ordering because they -// convert literal 0 to int and absl::weak_ordering can only be compared with -// literal 0. Defining this function allows for avoiding ClangTidy warnings. 
-bool Identity(const bool b) { return b; } +TEST(Btree, KeyComp) { + absl::btree_set<int> s; + EXPECT_TRUE(s.key_comp()(1, 2)); + EXPECT_FALSE(s.key_comp()(2, 2)); + EXPECT_FALSE(s.key_comp()(2, 1)); + + absl::btree_map<int, int> m1; + EXPECT_TRUE(m1.key_comp()(1, 2)); + EXPECT_FALSE(m1.key_comp()(2, 2)); + EXPECT_FALSE(m1.key_comp()(2, 1)); + + // Even though we internally adapt the comparator of `m2` to be three-way and + // heterogeneous, the comparator we expose through key_comp() is the original + // unadapted comparator. + absl::btree_map<std::string, int> m2; + EXPECT_TRUE(m2.key_comp()("a", "b")); + EXPECT_FALSE(m2.key_comp()("b", "b")); + EXPECT_FALSE(m2.key_comp()("b", "a")); +} TEST(Btree, ValueComp) { absl::btree_set<int> s; @@ -1724,13 +1739,13 @@ TEST(Btree, ValueComp) { EXPECT_FALSE(m1.value_comp()(std::make_pair(2, 0), std::make_pair(2, 0))); EXPECT_FALSE(m1.value_comp()(std::make_pair(2, 0), std::make_pair(1, 0))); + // Even though we internally adapt the comparator of `m2` to be three-way and + // heterogeneous, the comparator we expose through value_comp() is based on + // the original unadapted comparator. 
absl::btree_map<std::string, int> m2; - EXPECT_TRUE(Identity( - m2.value_comp()(std::make_pair("a", 0), std::make_pair("b", 0)) < 0)); - EXPECT_TRUE(Identity( - m2.value_comp()(std::make_pair("b", 0), std::make_pair("b", 0)) == 0)); - EXPECT_TRUE(Identity( - m2.value_comp()(std::make_pair("b", 0), std::make_pair("a", 0)) > 0)); + EXPECT_TRUE(m2.value_comp()(std::make_pair("a", 0), std::make_pair("b", 0))); + EXPECT_FALSE(m2.value_comp()(std::make_pair("b", 0), std::make_pair("b", 0))); + EXPECT_FALSE(m2.value_comp()(std::make_pair("b", 0), std::make_pair("a", 0))); } TEST(Btree, DefaultConstruction) { @@ -2893,6 +2908,46 @@ TEST(Btree, AllocMoveConstructor_DifferentAlloc) { EXPECT_EQ(bytes_used2, original_bytes_used); } +bool IntCmp(const int a, const int b) { return a < b; } + +TEST(Btree, SupportsFunctionPtrComparator) { + absl::btree_set<int, decltype(IntCmp) *> set(IntCmp); + set.insert({1, 2, 3}); + EXPECT_THAT(set, ElementsAre(1, 2, 3)); + EXPECT_TRUE(set.key_comp()(1, 2)); + EXPECT_TRUE(set.value_comp()(1, 2)); + + absl::btree_map<int, int, decltype(IntCmp) *> map(&IntCmp); + map[1] = 1; + EXPECT_THAT(map, ElementsAre(Pair(1, 1))); + EXPECT_TRUE(map.key_comp()(1, 2)); + EXPECT_TRUE(map.value_comp()(std::make_pair(1, 1), std::make_pair(2, 2))); +} + +template <typename Compare> +struct TransparentPassThroughComp { + using is_transparent = void; + + // This will fail compilation if we attempt a comparison that Compare does not + // support, and the failure will happen inside the function implementation so + // it can't be avoided by using SFINAE on this comparator. 
+ template <typename T, typename U> + bool operator()(const T &lhs, const U &rhs) const { + return Compare()(lhs, rhs); + } +}; + +TEST(Btree, + SupportsTransparentComparatorThatDoesNotImplementAllVisibleOperators) { + absl::btree_set<MultiKey, TransparentPassThroughComp<MultiKeyComp>> set; + set.insert(MultiKey{1, 2}); + EXPECT_TRUE(set.contains(1)); +} + +TEST(Btree, ConstructImplicitlyWithUnadaptedComparator) { + absl::btree_set<MultiKey, MultiKeyComp> set = {{}, MultiKeyComp{}}; +} + } // namespace } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc b/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc index 89ec60c916..8dda1d3539 100644 --- a/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc +++ b/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc @@ -282,6 +282,32 @@ TEST(FlatHashMap, NodeHandleMutableKeyAccess) { } #endif +TEST(FlatHashMap, Reserve) { + // Verify that if we reserve(size() + n) then we can perform n insertions + // without a rehash, i.e., without invalidating any references. + for (size_t trial = 0; trial < 20; ++trial) { + for (size_t initial = 3; initial < 100; ++initial) { + // Fill in `initial` entries, then erase 2 of them, then reserve space for + // two inserts and check for reference stability while doing the inserts. + flat_hash_map<size_t, size_t> map; + for (size_t i = 0; i < initial; ++i) { + map[i] = i; + } + map.erase(0); + map.erase(1); + map.reserve(map.size() + 2); + size_t& a2 = map[2]; + // In the event of a failure, asan will complain in one of these two + // assignments. 
+ map[initial] = a2; + map[initial + 1] = a2; + // Fail even when not under asan: + size_t& a2new = map[2]; + EXPECT_EQ(&a2, &a2new); + } + } +} + } // namespace } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/container/internal/btree.h b/third_party/abseil-cpp/absl/container/internal/btree.h index 00444a5397..f636c5fc73 100644 --- a/third_party/abseil-cpp/absl/container/internal/btree.h +++ b/third_party/abseil-cpp/absl/container/internal/btree.h @@ -88,7 +88,12 @@ struct StringBtreeDefaultLess { // Compatibility constructor. StringBtreeDefaultLess(std::less<std::string>) {} // NOLINT - StringBtreeDefaultLess(std::less<string_view>) {} // NOLINT + StringBtreeDefaultLess(std::less<absl::string_view>) {} // NOLINT + + // Allow converting to std::less for use in key_comp()/value_comp(). + explicit operator std::less<std::string>() const { return {}; } + explicit operator std::less<absl::string_view>() const { return {}; } + explicit operator std::less<absl::Cord>() const { return {}; } absl::weak_ordering operator()(absl::string_view lhs, absl::string_view rhs) const { @@ -115,7 +120,12 @@ struct StringBtreeDefaultGreater { StringBtreeDefaultGreater() = default; StringBtreeDefaultGreater(std::greater<std::string>) {} // NOLINT - StringBtreeDefaultGreater(std::greater<string_view>) {} // NOLINT + StringBtreeDefaultGreater(std::greater<absl::string_view>) {} // NOLINT + + // Allow converting to std::greater for use in key_comp()/value_comp(). 
+ explicit operator std::greater<std::string>() const { return {}; } + explicit operator std::greater<absl::string_view>() const { return {}; } + explicit operator std::greater<absl::Cord>() const { return {}; } absl::weak_ordering operator()(absl::string_view lhs, absl::string_view rhs) const { @@ -217,6 +227,8 @@ struct prefers_linear_node_search< template <typename Key, typename Compare, typename Alloc, int TargetNodeSize, bool Multi, typename SlotPolicy> struct common_params { + using original_key_compare = Compare; + // If Compare is a common comparator for a string-like type, then we adapt it // to use heterogeneous lookup and to be a key-compare-to comparator. using key_compare = typename key_compare_to_adapter<Compare>::type; @@ -317,16 +329,21 @@ struct map_params : common_params<Key, Compare, Alloc, TargetNodeSize, Multi, using value_type = typename super_type::value_type; using init_type = typename super_type::init_type; - using key_compare = typename super_type::key_compare; - // Inherit from key_compare for empty base class optimization. 
- struct value_compare : private key_compare { - value_compare() = default; - explicit value_compare(const key_compare &cmp) : key_compare(cmp) {} + using original_key_compare = typename super_type::original_key_compare; + // Reference: https://en.cppreference.com/w/cpp/container/map/value_compare + class value_compare { + template <typename Params> + friend class btree; - template <typename T, typename U> - auto operator()(const T &left, const U &right) const - -> decltype(std::declval<key_compare>()(left.first, right.first)) { - return key_compare::operator()(left.first, right.first); + protected: + explicit value_compare(original_key_compare c) : comp(std::move(c)) {} + + original_key_compare comp; // NOLINT + + public: + auto operator()(const value_type &lhs, const value_type &rhs) const + -> decltype(comp(lhs.first, rhs.first)) { + return comp(lhs.first, rhs.first); } }; using is_map_container = std::true_type; @@ -392,7 +409,8 @@ struct set_params : common_params<Key, Compare, Alloc, TargetNodeSize, Multi, set_slot_policy<Key>> { using value_type = Key; using slot_type = typename set_params::common_params::slot_type; - using value_compare = typename set_params::common_params::key_compare; + using value_compare = + typename set_params::common_params::original_key_compare; using is_map_container = std::false_type; template <typename V> @@ -484,8 +502,8 @@ class btree_node { std::is_same<std::greater<key_type>, key_compare>::value)>; - // This class is organized by gtl::Layout as if it had the following - // structure: + // This class is organized by absl::container_internal::Layout as if it had + // the following structure: // // A pointer to the node's parent. // btree_node *parent; // @@ -579,10 +597,10 @@ class btree_node { }; // Leaves can have less than kNodeSlots values. 
- constexpr static layout_type LeafLayout(const int slots = kNodeSlots) { + constexpr static layout_type LeafLayout(const int slot_count = kNodeSlots) { return layout_type(/*parent*/ 1, /*position, start, finish, max_count*/ 4, - /*slots*/ slots, + /*slots*/ slot_count, /*children*/ 0); } constexpr static layout_type InternalLayout() { @@ -591,8 +609,8 @@ class btree_node { /*slots*/ kNodeSlots, /*children*/ kNodeSlots + 1); } - constexpr static size_type LeafSize(const int slots = kNodeSlots) { - return LeafLayout(slots).AllocSize(); + constexpr static size_type LeafSize(const int slot_count = kNodeSlots) { + return LeafLayout(slot_count).AllocSize(); } constexpr static size_type InternalSize() { return InternalLayout().AllocSize(); @@ -1129,6 +1147,7 @@ class btree { using size_type = typename Params::size_type; using difference_type = typename Params::difference_type; using key_compare = typename Params::key_compare; + using original_key_compare = typename Params::original_key_compare; using value_compare = typename Params::value_compare; using allocator_type = typename Params::allocator_type; using reference = typename Params::reference; @@ -1338,7 +1357,9 @@ class btree { return compare_internal::compare_result_as_less_than(key_comp()(a, b)); } - value_compare value_comp() const { return value_compare(key_comp()); } + value_compare value_comp() const { + return value_compare(original_key_compare(key_comp())); + } // Verifies the structure of the btree. 
void verify() const; diff --git a/third_party/abseil-cpp/absl/container/internal/btree_container.h b/third_party/abseil-cpp/absl/container/internal/btree_container.h index 03be708e4f..a99668c713 100644 --- a/third_party/abseil-cpp/absl/container/internal/btree_container.h +++ b/third_party/abseil-cpp/absl/container/internal/btree_container.h @@ -20,6 +20,7 @@ #include <iterator> #include <utility> +#include "absl/base/attributes.h" #include "absl/base/internal/throw_delegate.h" #include "absl/container/internal/btree.h" // IWYU pragma: export #include "absl/container/internal/common.h" @@ -51,7 +52,7 @@ class btree_container { using value_type = typename Tree::value_type; using size_type = typename Tree::size_type; using difference_type = typename Tree::difference_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using value_compare = typename Tree::value_compare; using allocator_type = typename Tree::allocator_type; using reference = typename Tree::reference; @@ -176,7 +177,7 @@ class btree_container { } // Utility routines. - void clear() { tree_.clear(); } + ABSL_ATTRIBUTE_REINITIALIZES void clear() { tree_.clear(); } void swap(btree_container &other) { tree_.swap(other.tree_); } void verify() const { tree_.verify(); } @@ -214,7 +215,7 @@ class btree_container { allocator_type get_allocator() const { return tree_.get_allocator(); } // The key comparator used by the btree. - key_compare key_comp() const { return tree_.key_comp(); } + key_compare key_comp() const { return key_compare(tree_.key_comp()); } value_compare value_comp() const { return tree_.value_comp(); } // Support absl::Hash. 
@@ -247,7 +248,7 @@ class btree_set_container : public btree_container<Tree> { using key_type = typename Tree::key_type; using value_type = typename Tree::value_type; using size_type = typename Tree::size_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using allocator_type = typename Tree::allocator_type; using iterator = typename Tree::iterator; using const_iterator = typename Tree::const_iterator; @@ -398,7 +399,7 @@ class btree_map_container : public btree_set_container<Tree> { using key_type = typename Tree::key_type; using mapped_type = typename params_type::mapped_type; using value_type = typename Tree::value_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using allocator_type = typename Tree::allocator_type; using iterator = typename Tree::iterator; using const_iterator = typename Tree::const_iterator; @@ -543,7 +544,7 @@ class btree_multiset_container : public btree_container<Tree> { using key_type = typename Tree::key_type; using value_type = typename Tree::value_type; using size_type = typename Tree::size_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using allocator_type = typename Tree::allocator_type; using iterator = typename Tree::iterator; using const_iterator = typename Tree::const_iterator; diff --git a/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h b/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h index 6869fe45e8..f1f555a5c1 100644 --- a/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h +++ b/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h @@ -21,11 +21,13 @@ #include <stdint.h> #include <algorithm> +#include <cassert> #include <iosfwd> #include <random> #include <tuple> #include <type_traits> #include <utility> +#include <vector> #include 
"absl/container/internal/hash_policy_testing.h" #include "absl/memory/memory.h" @@ -153,6 +155,25 @@ using GeneratedType = decltype( typename Container::value_type, typename Container::key_type>::type>&>()()); +// Naive wrapper that performs a linear search of previous values. +// Beware this is O(SQR), which is reasonable for smaller kMaxValues. +template <class T, size_t kMaxValues = 64, class E = void> +struct UniqueGenerator { + Generator<T, E> gen; + std::vector<T> values; + + T operator()() { + assert(values.size() < kMaxValues); + for (;;) { + T value = gen(); + if (std::find(values.begin(), values.end(), value) == values.end()) { + values.push_back(value); + return value; + } + } + } +}; + } // namespace hash_internal } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/container/internal/inlined_vector.h b/third_party/abseil-cpp/absl/container/internal/inlined_vector.h index b8aec45b79..49822af0b7 100644 --- a/third_party/abseil-cpp/absl/container/internal/inlined_vector.h +++ b/third_party/abseil-cpp/absl/container/internal/inlined_vector.h @@ -36,6 +36,7 @@ namespace inlined_vector_internal { // GCC does not deal very well with the below code #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -955,7 +956,7 @@ auto Storage<T, N, A>::Swap(Storage* other_storage_ptr) -> void { swap(*GetAllocPtr(), *other_storage_ptr->GetAllocPtr()); } -// End ignore "maybe-uninitialized" +// End ignore "array-bounds" and "maybe-uninitialized" #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic pop #endif diff --git a/third_party/abseil-cpp/absl/container/internal/layout.h b/third_party/abseil-cpp/absl/container/internal/layout.h index 2336783315..a59a243059 100644 --- a/third_party/abseil-cpp/absl/container/internal/layout.h +++ 
b/third_party/abseil-cpp/absl/container/internal/layout.h @@ -404,7 +404,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, constexpr size_t Offset() const { static_assert(N < NumOffsets, "Index out of bounds"); return adl_barrier::Align( - Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1], + Offset<N - 1>() + SizeOf<ElementType<N - 1>>::value * size_[N - 1], ElementAlignment<N>::value); } @@ -597,7 +597,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, constexpr size_t AllocSize() const { static_assert(NumTypes == NumSizes, "You must specify sizes of all fields"); return Offset<NumTypes - 1>() + - SizeOf<ElementType<NumTypes - 1>>() * size_[NumTypes - 1]; + SizeOf<ElementType<NumTypes - 1>>::value * size_[NumTypes - 1]; } // If built with --config=asan, poisons padding bytes (if any) in the @@ -621,7 +621,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, // The `if` is an optimization. It doesn't affect the observable behaviour. if (ElementAlignment<N - 1>::value % ElementAlignment<N>::value) { size_t start = - Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1]; + Offset<N - 1>() + SizeOf<ElementType<N - 1>>::value * size_[N - 1]; ASAN_POISON_MEMORY_REGION(p + start, Offset<N>() - start); } #endif @@ -645,7 +645,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, // produce "unsigned*" where another produces "unsigned int *". 
std::string DebugString() const { const auto offsets = Offsets(); - const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>()...}; + const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>::value...}; const std::string types[] = { adl_barrier::TypeName<ElementType<OffsetSeq>>()...}; std::string res = absl::StrCat("@0", types[0], "(", sizes[0], ")"); diff --git a/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h b/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h index 0a02757ddf..c7df2efc62 100644 --- a/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h +++ b/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h @@ -51,8 +51,9 @@ class raw_hash_map : public raw_hash_set<Policy, Hash, Eq, Alloc> { using key_arg = typename KeyArgImpl::template type<K, key_type>; static_assert(!std::is_reference<key_type>::value, ""); - // TODO(alkis): remove this assertion and verify that reference mapped_type is - // supported. + + // TODO(b/187807849): Evaluate whether to support reference mapped_type and + // remove this assertion if/when it is supported. static_assert(!std::is_reference<mapped_type>::value, ""); using iterator = typename raw_hash_map::raw_hash_set::iterator; diff --git a/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h b/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h index 80fc2cba3f..aa78265ca1 100644 --- a/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h +++ b/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h @@ -628,7 +628,9 @@ class raw_hash_set { static Layout MakeLayout(size_t capacity) { assert(IsValidCapacity(capacity)); - return Layout(capacity + Group::kWidth + 1, capacity); + // The extra control bytes are for 1 sentinel byte followed by + // `Group::kWidth - 1` bytes that are cloned from the beginning. 
+ return Layout(capacity + Group::kWidth, capacity); } using AllocTraits = absl::allocator_traits<allocator_type>; @@ -792,7 +794,8 @@ class raw_hash_set { explicit raw_hash_set(size_t bucket_count, const hasher& hash = hasher(), const key_equal& eq = key_equal(), const allocator_type& alloc = allocator_type()) - : ctrl_(EmptyGroup()), settings_(0, hash, eq, alloc) { + : ctrl_(EmptyGroup()), + settings_(0, HashtablezInfoHandle(), hash, eq, alloc) { if (bucket_count) { capacity_ = NormalizeCapacity(bucket_count); initialize_slots(); @@ -903,7 +906,7 @@ class raw_hash_set { auto target = find_first_non_full(ctrl_, hash, capacity_); set_ctrl(target.offset, H2(hash)); emplace_at(target.offset, v); - infoz_.RecordInsert(hash, target.probe_length); + infoz().RecordInsert(hash, target.probe_length); } size_ = that.size(); growth_left() -= that.size(); @@ -917,28 +920,27 @@ class raw_hash_set { slots_(absl::exchange(that.slots_, nullptr)), size_(absl::exchange(that.size_, 0)), capacity_(absl::exchange(that.capacity_, 0)), - infoz_(absl::exchange(that.infoz_, HashtablezInfoHandle())), // Hash, equality and allocator are copied instead of moved because // `that` must be left valid. If Hash is std::function<Key>, moving it // would create a nullptr functor that cannot be called. - settings_(that.settings_) { - // growth_left was copied above, reset the one from `that`. 
- that.growth_left() = 0; - } + settings_(absl::exchange(that.growth_left(), 0), + absl::exchange(that.infoz(), HashtablezInfoHandle()), + that.hash_ref(), that.eq_ref(), that.alloc_ref()) {} raw_hash_set(raw_hash_set&& that, const allocator_type& a) : ctrl_(EmptyGroup()), slots_(nullptr), size_(0), capacity_(0), - settings_(0, that.hash_ref(), that.eq_ref(), a) { + settings_(0, HashtablezInfoHandle(), that.hash_ref(), that.eq_ref(), + a) { if (a == that.alloc_ref()) { std::swap(ctrl_, that.ctrl_); std::swap(slots_, that.slots_); std::swap(size_, that.size_); std::swap(capacity_, that.capacity_); std::swap(growth_left(), that.growth_left()); - std::swap(infoz_, that.infoz_); + std::swap(infoz(), that.infoz()); } else { reserve(that.size()); // Note: this will copy elements of dense_set and unordered_set instead of @@ -1009,7 +1011,7 @@ class raw_hash_set { reset_growth_left(); } assert(empty()); - infoz_.RecordStorageChanged(0, capacity_); + infoz().RecordStorageChanged(0, capacity_); } // This overload kicks in when the argument is an rvalue of insertable and @@ -1301,7 +1303,7 @@ class raw_hash_set { swap(growth_left(), that.growth_left()); swap(hash_ref(), that.hash_ref()); swap(eq_ref(), that.eq_ref()); - swap(infoz_, that.infoz_); + swap(infoz(), that.infoz()); SwapAlloc(alloc_ref(), that.alloc_ref(), typename AllocTraits::propagate_on_container_swap{}); } @@ -1310,7 +1312,7 @@ class raw_hash_set { if (n == 0 && capacity_ == 0) return; if (n == 0 && size_ == 0) { destroy_slots(); - infoz_.RecordStorageChanged(0, 0); + infoz().RecordStorageChanged(0, 0); return; } // bitor is a faster way of doing `max` here. We will round up to the next @@ -1323,8 +1325,8 @@ class raw_hash_set { } void reserve(size_t n) { - size_t m = GrowthToLowerboundCapacity(n); - if (m > capacity_) { + if (n > size() + growth_left()) { + size_t m = GrowthToLowerboundCapacity(n); resize(NormalizeCapacity(m)); } } @@ -1528,7 +1530,7 @@ class raw_hash_set { set_ctrl(index, was_never_full ? 
kEmpty : kDeleted); growth_left() += was_never_full; - infoz_.RecordErase(); + infoz().RecordErase(); } void initialize_slots() { @@ -1545,17 +1547,17 @@ class raw_hash_set { // bound more carefully. if (std::is_same<SlotAlloc, std::allocator<slot_type>>::value && slots_ == nullptr) { - infoz_ = Sample(); + infoz() = Sample(); } auto layout = MakeLayout(capacity_); char* mem = static_cast<char*>( Allocate<Layout::Alignment()>(&alloc_ref(), layout.AllocSize())); - ctrl_ = reinterpret_cast<ctrl_t*>(layout.template Pointer<0>(mem)); + ctrl_ = layout.template Pointer<0>(mem); slots_ = layout.template Pointer<1>(mem); reset_ctrl(); reset_growth_left(); - infoz_.RecordStorageChanged(size_, capacity_); + infoz().RecordStorageChanged(size_, capacity_); } void destroy_slots() { @@ -1603,7 +1605,7 @@ class raw_hash_set { Deallocate<Layout::Alignment()>(&alloc_ref(), old_ctrl, layout.AllocSize()); } - infoz_.RecordRehash(total_probe_length); + infoz().RecordRehash(total_probe_length); } void drop_deletes_without_resize() ABSL_ATTRIBUTE_NOINLINE { @@ -1669,7 +1671,7 @@ class raw_hash_set { } } reset_growth_left(); - infoz_.RecordRehash(total_probe_length); + infoz().RecordRehash(total_probe_length); } void rehash_and_grow_if_necessary() { @@ -1743,7 +1745,7 @@ class raw_hash_set { ++size_; growth_left() -= IsEmpty(ctrl_[target.offset]); set_ctrl(target.offset, H2(hash)); - infoz_.RecordInsert(hash, target.probe_length); + infoz().RecordInsert(hash, target.probe_length); return target.offset; } @@ -1782,8 +1784,8 @@ class raw_hash_set { growth_left() = CapacityToGrowth(capacity()) - size_; } - // Sets the control byte, and if `i < Group::kWidth`, set the cloned byte at - // the end too. + // Sets the control byte, and if `i < Group::kWidth - 1`, set the cloned byte + // at the end too. 
void set_ctrl(size_t i, ctrl_t h) { assert(i < capacity_); @@ -1794,32 +1796,35 @@ class raw_hash_set { } ctrl_[i] = h; - ctrl_[((i - Group::kWidth) & capacity_) + 1 + - ((Group::kWidth - 1) & capacity_)] = h; + constexpr size_t kClonedBytes = Group::kWidth - 1; + ctrl_[((i - kClonedBytes) & capacity_) + (kClonedBytes & capacity_)] = h; } size_t& growth_left() { return settings_.template get<0>(); } - hasher& hash_ref() { return settings_.template get<1>(); } - const hasher& hash_ref() const { return settings_.template get<1>(); } - key_equal& eq_ref() { return settings_.template get<2>(); } - const key_equal& eq_ref() const { return settings_.template get<2>(); } - allocator_type& alloc_ref() { return settings_.template get<3>(); } + HashtablezInfoHandle& infoz() { return settings_.template get<1>(); } + + hasher& hash_ref() { return settings_.template get<2>(); } + const hasher& hash_ref() const { return settings_.template get<2>(); } + key_equal& eq_ref() { return settings_.template get<3>(); } + const key_equal& eq_ref() const { return settings_.template get<3>(); } + allocator_type& alloc_ref() { return settings_.template get<4>(); } const allocator_type& alloc_ref() const { - return settings_.template get<3>(); + return settings_.template get<4>(); } // TODO(alkis): Investigate removing some of these fields: // - ctrl/slots can be derived from each other // - size can be moved into the slot array - ctrl_t* ctrl_ = EmptyGroup(); // [(capacity + 1) * ctrl_t] + ctrl_t* ctrl_ = EmptyGroup(); // [(capacity + Group::kWidth) * ctrl_t] slot_type* slots_ = nullptr; // [capacity * slot_type] size_t size_ = 0; // number of full slots size_t capacity_ = 0; // total number of slots - HashtablezInfoHandle infoz_; - absl::container_internal::CompressedTuple<size_t /* growth_left */, hasher, + absl::container_internal::CompressedTuple<size_t /* growth_left */, + HashtablezInfoHandle, hasher, key_equal, allocator_type> - settings_{0, hasher{}, key_equal{}, allocator_type{}}; 
+ settings_{0, HashtablezInfoHandle{}, hasher{}, key_equal{}, + allocator_type{}}; }; // Erases all elements that satisfy the predicate `pred` from the container `c`. diff --git a/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc b/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc index 81c4b47c04..af882ef49f 100644 --- a/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc +++ b/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc @@ -419,6 +419,13 @@ TEST(Table, EmptyFunctorOptimization) { size_t growth_left; void* infoz; }; + struct MockTableInfozDisabled { + void* ctrl; + void* slots; + size_t size; + size_t capacity; + size_t growth_left; + }; struct StatelessHash { size_t operator()(absl::string_view) const { return 0; } }; @@ -426,17 +433,27 @@ TEST(Table, EmptyFunctorOptimization) { size_t dummy; }; - EXPECT_EQ( - sizeof(MockTable), - sizeof( - raw_hash_set<StringPolicy, StatelessHash, - std::equal_to<absl::string_view>, std::allocator<int>>)); + if (std::is_empty<HashtablezInfoHandle>::value) { + EXPECT_EQ(sizeof(MockTableInfozDisabled), + sizeof(raw_hash_set<StringPolicy, StatelessHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); + + EXPECT_EQ(sizeof(MockTableInfozDisabled) + sizeof(StatefulHash), + sizeof(raw_hash_set<StringPolicy, StatefulHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); + } else { + EXPECT_EQ(sizeof(MockTable), + sizeof(raw_hash_set<StringPolicy, StatelessHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); - EXPECT_EQ( - sizeof(MockTable) + sizeof(StatefulHash), - sizeof( - raw_hash_set<StringPolicy, StatefulHash, - std::equal_to<absl::string_view>, std::allocator<int>>)); + EXPECT_EQ(sizeof(MockTable) + sizeof(StatefulHash), + sizeof(raw_hash_set<StringPolicy, StatefulHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); + } } TEST(Table, Empty) { @@ -524,6 +541,37 @@ TEST(Table, 
InsertCollisionAndFindAfterDelete) { EXPECT_TRUE(t.empty()); } +TEST(Table, InsertWithinCapacity) { + IntTable t; + t.reserve(10); + const size_t original_capacity = t.capacity(); + const auto addr = [&](int i) { + return reinterpret_cast<uintptr_t>(&*t.find(i)); + }; + // Inserting an element does not change capacity. + t.insert(0); + EXPECT_THAT(t.capacity(), original_capacity); + const uintptr_t original_addr_0 = addr(0); + // Inserting another element does not rehash. + t.insert(1); + EXPECT_THAT(t.capacity(), original_capacity); + EXPECT_THAT(addr(0), original_addr_0); + // Inserting lots of duplicate elements does not rehash. + for (int i = 0; i < 100; ++i) { + t.insert(i % 10); + } + EXPECT_THAT(t.capacity(), original_capacity); + EXPECT_THAT(addr(0), original_addr_0); + // Inserting a range of duplicate elements does not rehash. + std::vector<int> dup_range; + for (int i = 0; i < 100; ++i) { + dup_range.push_back(i % 10); + } + t.insert(dup_range.begin(), dup_range.end()); + EXPECT_THAT(t.capacity(), original_capacity); + EXPECT_THAT(addr(0), original_addr_0); +} + TEST(Table, LazyEmplace) { StringTable t; bool called = false; diff --git a/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h b/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h index 3f90ad7ca8..c1d20f3c52 100644 --- a/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h +++ b/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h @@ -179,7 +179,7 @@ TYPED_TEST_P(ConstructorTest, InputIteratorBucketHashEqualAlloc) { A alloc(0); std::vector<T> values; std::generate_n(std::back_inserter(values), 10, - hash_internal::Generator<T>()); + hash_internal::UniqueGenerator<T>()); TypeParam m(values.begin(), values.end(), 123, hasher, equal, alloc); EXPECT_EQ(m.hash_function(), hasher); EXPECT_EQ(m.key_eq(), equal); @@ -198,7 +198,7 @@ void InputIteratorBucketAllocTest(std::true_type) { A 
alloc(0); std::vector<T> values; std::generate_n(std::back_inserter(values), 10, - hash_internal::Generator<T>()); + hash_internal::UniqueGenerator<T>()); TypeParam m(values.begin(), values.end(), 123, alloc); EXPECT_EQ(m.get_allocator(), alloc); EXPECT_THAT(items(m), ::testing::UnorderedElementsAreArray(values)); @@ -221,7 +221,7 @@ void InputIteratorBucketHashAllocTest(std::true_type) { A alloc(0); std::vector<T> values; std::generate_n(std::back_inserter(values), 10, - hash_internal::Generator<T>()); + hash_internal::UniqueGenerator<T>()); TypeParam m(values.begin(), values.end(), 123, hasher, alloc); EXPECT_EQ(m.hash_function(), hasher); EXPECT_EQ(m.get_allocator(), alloc); @@ -241,8 +241,9 @@ TYPED_TEST_P(ConstructorTest, CopyConstructor) { H hasher; E equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam n(m); EXPECT_EQ(m.hash_function(), n.hash_function()); EXPECT_EQ(m.key_eq(), n.key_eq()); @@ -262,8 +263,9 @@ void CopyConstructorAllocTest(std::true_type) { H hasher; E equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam n(m, A(11)); EXPECT_EQ(m.hash_function(), n.hash_function()); EXPECT_EQ(m.key_eq(), n.key_eq()); @@ -285,8 +287,9 @@ TYPED_TEST_P(ConstructorTest, MoveConstructor) { H hasher; E equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam t(m); TypeParam n(std::move(t)); EXPECT_EQ(m.hash_function(), n.hash_function()); @@ -307,8 +310,9 @@ void MoveConstructorAllocTest(std::true_type) { H hasher; E 
equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam t(m); TypeParam n(std::move(t), A(1)); EXPECT_EQ(m.hash_function(), n.hash_function()); @@ -325,7 +329,7 @@ TYPED_TEST_P(ConstructorTest, MoveConstructorAlloc) { TYPED_TEST_P(ConstructorTest, InitializerListBucketHashEqualAlloc) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; using H = typename TypeParam::hasher; using E = typename TypeParam::key_equal; @@ -348,7 +352,7 @@ template <typename TypeParam> void InitializerListBucketAllocTest(std::true_type) { using T = hash_internal::GeneratedType<TypeParam>; using A = typename TypeParam::allocator_type; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; A alloc(0); TypeParam m(values, 123, alloc); @@ -371,7 +375,7 @@ void InitializerListBucketHashAllocTest(std::true_type) { using A = typename TypeParam::allocator_type; H hasher; A alloc(0); - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m(values, 123, hasher, alloc); EXPECT_EQ(m.hash_function(), hasher); @@ -392,7 +396,7 @@ TYPED_TEST_P(ConstructorTest, Assignment) { H hasher; E equal; A alloc(0); - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}, 123, hasher, equal, alloc); TypeParam n; n = m; @@ -412,7 +416,7 @@ TYPED_TEST_P(ConstructorTest, MoveAssignment) { H hasher; E equal; A alloc(0); - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}, 123, hasher, equal, 
alloc); TypeParam t(m); TypeParam n; @@ -424,7 +428,7 @@ TYPED_TEST_P(ConstructorTest, MoveAssignment) { TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerList) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m; m = values; @@ -433,7 +437,7 @@ TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerList) { TYPED_TEST_P(ConstructorTest, AssignmentOverwritesExisting) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}); TypeParam n({gen()}); n = m; @@ -442,7 +446,7 @@ TYPED_TEST_P(ConstructorTest, AssignmentOverwritesExisting) { TYPED_TEST_P(ConstructorTest, MoveAssignmentOverwritesExisting) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}); TypeParam t(m); TypeParam n({gen()}); @@ -452,7 +456,7 @@ TYPED_TEST_P(ConstructorTest, MoveAssignmentOverwritesExisting) { TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerListOverwritesExisting) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m; m = values; @@ -461,7 +465,7 @@ TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerListOverwritesExisting) { TYPED_TEST_P(ConstructorTest, AssignmentOnSelf) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m(values); m = *&m; // Avoid -Wself-assign diff --git a/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h 
b/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h index 8c9ca779a4..d3543936f7 100644 --- a/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h +++ b/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h @@ -81,6 +81,38 @@ TYPED_TEST_P(ModifiersTest, InsertRange) { ASSERT_THAT(items(m), ::testing::UnorderedElementsAreArray(values)); } +TYPED_TEST_P(ModifiersTest, InsertWithinCapacity) { + using T = hash_internal::GeneratedType<TypeParam>; + using V = typename TypeParam::mapped_type; + T val = hash_internal::Generator<T>()(); + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(val); + EXPECT_EQ(m.bucket_count(), original_capacity); + T val2 = {val.first, hash_internal::Generator<V>()()}; + m.insert(val2); + EXPECT_EQ(m.bucket_count(), original_capacity); +} + +TYPED_TEST_P(ModifiersTest, InsertRangeWithinCapacity) { +#if !defined(__GLIBCXX__) + using T = hash_internal::GeneratedType<TypeParam>; + std::vector<T> base_values; + std::generate_n(std::back_inserter(base_values), 10, + hash_internal::Generator<T>()); + std::vector<T> values; + while (values.size() != 100) { + std::copy_n(base_values.begin(), 10, std::back_inserter(values)); + } + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(values.begin(), values.end()); + EXPECT_EQ(m.bucket_count(), original_capacity); +#endif +} + TYPED_TEST_P(ModifiersTest, InsertOrAssign) { #ifdef UNORDERED_MAP_CXX17 using std::get; @@ -266,9 +298,10 @@ TYPED_TEST_P(ModifiersTest, Swap) { // TODO(alkis): Write tests for merge. 
REGISTER_TYPED_TEST_CASE_P(ModifiersTest, Clear, Insert, InsertHint, - InsertRange, InsertOrAssign, InsertOrAssignHint, - Emplace, EmplaceHint, TryEmplace, TryEmplaceHint, - Erase, EraseRange, EraseKey, Swap); + InsertRange, InsertWithinCapacity, + InsertRangeWithinCapacity, InsertOrAssign, + InsertOrAssignHint, Emplace, EmplaceHint, TryEmplace, + TryEmplaceHint, Erase, EraseRange, EraseKey, Swap); template <typename Type> struct is_unique_ptr : std::false_type {}; diff --git a/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h b/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h index 26be58d99f..6e473e45da 100644 --- a/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h +++ b/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h @@ -74,6 +74,36 @@ TYPED_TEST_P(ModifiersTest, InsertRange) { ASSERT_THAT(keys(m), ::testing::UnorderedElementsAreArray(values)); } +TYPED_TEST_P(ModifiersTest, InsertWithinCapacity) { + using T = hash_internal::GeneratedType<TypeParam>; + T val = hash_internal::Generator<T>()(); + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(val); + EXPECT_EQ(m.bucket_count(), original_capacity); + m.insert(val); + EXPECT_EQ(m.bucket_count(), original_capacity); +} + +TYPED_TEST_P(ModifiersTest, InsertRangeWithinCapacity) { +#if !defined(__GLIBCXX__) + using T = hash_internal::GeneratedType<TypeParam>; + std::vector<T> base_values; + std::generate_n(std::back_inserter(base_values), 10, + hash_internal::Generator<T>()); + std::vector<T> values; + while (values.size() != 100) { + values.insert(values.end(), base_values.begin(), base_values.end()); + } + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(values.begin(), values.end()); + EXPECT_EQ(m.bucket_count(), original_capacity); +#endif +} + TYPED_TEST_P(ModifiersTest, Emplace) { using T = 
hash_internal::GeneratedType<TypeParam>; T val = hash_internal::Generator<T>()(); @@ -180,8 +210,9 @@ TYPED_TEST_P(ModifiersTest, Swap) { // TODO(alkis): Write tests for merge. REGISTER_TYPED_TEST_CASE_P(ModifiersTest, Clear, Insert, InsertHint, - InsertRange, Emplace, EmplaceHint, Erase, EraseRange, - EraseKey, Swap); + InsertRange, InsertWithinCapacity, + InsertRangeWithinCapacity, Emplace, EmplaceHint, + Erase, EraseRange, EraseKey, Swap); } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake b/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake index 9cd6fd1b2a..942ce90a4d 100644 --- a/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake +++ b/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake @@ -35,8 +35,7 @@ endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(ABSL_DEFAULT_COPTS "${ABSL_GCC_FLAGS}") set(ABSL_TEST_COPTS "${ABSL_GCC_FLAGS};${ABSL_GCC_TEST_FLAGS}") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # MATCHES so we get both Clang and AppleClang +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # MATCHES so we get both Clang and AppleClang if(MSVC) # clang-cl is half MSVC, half LLVM set(ABSL_DEFAULT_COPTS "${ABSL_CLANG_CL_FLAGS}") diff --git a/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake b/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake index 51742c9b6b..22a25eba7f 100644 --- a/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake +++ b/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake @@ -71,12 +71,13 @@ list(APPEND ABSL_LLVM_FLAGS "-Wformat-security" "-Wgnu-redeclared-enum" "-Winfinite-recursion" + "-Winvalid-constexpr" "-Wliteral-conversion" "-Wmissing-declarations" "-Woverlength-strings" "-Wpointer-arith" "-Wself-assign" - "-Wshadow" + "-Wshadow-all" "-Wstring-conversion" "-Wtautological-overlap-compare" "-Wundef" diff --git a/third_party/abseil-cpp/absl/debugging/CMakeLists.txt 
b/third_party/abseil-cpp/absl/debugging/CMakeLists.txt index 074b44cf17..bb4d4c92da 100644 --- a/third_party/abseil-cpp/absl/debugging/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/debugging/CMakeLists.txt @@ -87,7 +87,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::strings - gmock + GTest::gmock ) absl_cc_library( @@ -141,7 +141,7 @@ absl_cc_test( absl::strings absl::raw_logging_internal Threads::Threads - gmock + GTest::gmock ) absl_cc_library( @@ -194,7 +194,7 @@ absl_cc_test( absl::core_headers absl::memory absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -261,7 +261,7 @@ absl_cc_test( DEPS absl::leak_check_api_enabled_for_testing absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -275,7 +275,7 @@ absl_cc_test( DEPS absl::leak_check_api_disabled_for_testing absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -292,7 +292,7 @@ absl_cc_test( absl::leak_check_disable absl::base absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -322,7 +322,7 @@ absl_cc_test( absl::stack_consumption absl::core_headers absl::raw_logging_internal - gmock_main + GTest::gmock_main ) # component target diff --git a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc index a9ed6ef964..689e5979e7 100644 --- a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc +++ b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc @@ -136,7 +136,8 @@ static bool SetupAlternateStackOnce() { #else const size_t page_mask = sysconf(_SC_PAGESIZE) - 1; #endif - size_t stack_size = (std::max(SIGSTKSZ, 65536) + page_mask) & ~page_mask; + size_t stack_size = + (std::max<size_t>(SIGSTKSZ, 65536) + page_mask) & ~page_mask; #if defined(ABSL_HAVE_ADDRESS_SANITIZER) || \ defined(ABSL_HAVE_MEMORY_SANITIZER) || defined(ABSL_HAVE_THREAD_SANITIZER) // Account for sanitizer instrumentation requiring additional stack 
space. @@ -366,6 +367,7 @@ static void AbslFailureSignalHandler(int signo, siginfo_t*, void* ucontext) { // goes after this point. if (fsh_options.writerfn != nullptr) { WriteFailureInfo(signo, ucontext, my_cpu, fsh_options.writerfn); + fsh_options.writerfn(nullptr); } if (fsh_options.call_previous_handler) { diff --git a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h index 0c0f585d0f..500115c0ab 100644 --- a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h +++ b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h @@ -90,7 +90,7 @@ struct FailureSignalHandlerOptions { // If non-null, indicates a pointer to a callback function that will be called // upon failure, with a string argument containing failure data. This function // may be used as a hook to write failure data to a secondary location, such - // as a log file. This function may also be called with null data, as a hint + // as a log file. This function will also be called with null data, as a hint // to flush any buffered data before the program may be terminated. Consider // flushing any buffered data in all calls to this function. // diff --git a/third_party/abseil-cpp/absl/debugging/internal/demangle.cc b/third_party/abseil-cpp/absl/debugging/internal/demangle.cc index 46cdb67b1f..5cd563208e 100644 --- a/third_party/abseil-cpp/absl/debugging/internal/demangle.cc +++ b/third_party/abseil-cpp/absl/debugging/internal/demangle.cc @@ -386,24 +386,28 @@ static bool IsDigit(char c) { return c >= '0' && c <= '9'; } // by GCC 4.5.x and later versions (and our locally-modified version of GCC // 4.4.x) to indicate functions which have been cloned during optimization. // We treat any sequence (.<alpha>+.<digit>+)+ as a function clone suffix. +// Additionally, '_' is allowed along with the alphanumeric sequence. 
static bool IsFunctionCloneSuffix(const char *str) { size_t i = 0; while (str[i] != '\0') { - // Consume a single .<alpha>+.<digit>+ sequence. - if (str[i] != '.' || !IsAlpha(str[i + 1])) { - return false; + bool parsed = false; + // Consume a single [.<alpha> | _]*[.<digit>]* sequence. + if (str[i] == '.' && (IsAlpha(str[i + 1]) || str[i + 1] == '_')) { + parsed = true; + i += 2; + while (IsAlpha(str[i]) || str[i] == '_') { + ++i; + } } - i += 2; - while (IsAlpha(str[i])) { - ++i; + if (str[i] == '.' && IsDigit(str[i + 1])) { + parsed = true; + i += 2; + while (IsDigit(str[i])) { + ++i; + } } - if (str[i] != '.' || !IsDigit(str[i + 1])) { + if (!parsed) return false; - } - i += 2; - while (IsDigit(str[i])) { - ++i; - } } return true; // Consumed everything in "str". } diff --git a/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc b/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc index 0bed7359d8..6b142902ca 100644 --- a/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc +++ b/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc @@ -70,12 +70,34 @@ TEST(Demangle, Clones) { EXPECT_STREQ("Foo()", tmp); EXPECT_TRUE(Demangle("_ZL3Foov.isra.2.constprop.18", tmp, sizeof(tmp))); EXPECT_STREQ("Foo()", tmp); - // Invalid (truncated), should not demangle. - EXPECT_FALSE(Demangle("_ZL3Foov.clo", tmp, sizeof(tmp))); + // Demangle suffixes produced by -funique-internal-linkage-names. + EXPECT_TRUE(Demangle("_ZL3Foov.__uniq.12345", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + EXPECT_TRUE(Demangle("_ZL3Foov.__uniq.12345.isra.2.constprop.18", tmp, + sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // Suffixes without the number should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.clo", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // Suffixes with just the number should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.123", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // (.clone. 
followed by non-number), should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.clone.foo", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // (.clone. followed by multiple numbers), should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.clone.123.456", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // (a long valid suffix), should demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.part.9.165493.constprop.775.31805", tmp, + sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // Invalid (. without anything else), should not demangle. + EXPECT_FALSE(Demangle("_ZL3Foov.", tmp, sizeof(tmp))); + // Invalid (. with mix of alpha and digits), should not demangle. + EXPECT_FALSE(Demangle("_ZL3Foov.abc123", tmp, sizeof(tmp))); // Invalid (.clone. not followed by number), should not demangle. EXPECT_FALSE(Demangle("_ZL3Foov.clone.", tmp, sizeof(tmp))); - // Invalid (.clone. followed by non-number), should not demangle. - EXPECT_FALSE(Demangle("_ZL3Foov.clone.foo", tmp, sizeof(tmp))); // Invalid (.constprop. not followed by number), should not demangle. EXPECT_FALSE(Demangle("_ZL3Foov.isra.2.constprop.", tmp, sizeof(tmp))); } diff --git a/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc b/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc index bc320ff75b..70f79dfcb8 100644 --- a/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc +++ b/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc @@ -132,9 +132,8 @@ static uintptr_t GetFP(const void *vuc) { const uintptr_t bp = 0; const uintptr_t sp = 0; #endif - // Sanity-check that the base pointer is valid. It should be as long as - // SHRINK_WRAP_FRAME_POINTER is not set, but it's possible that some code in - // the process is compiled with --copt=-fomit-frame-pointer or + // Sanity-check that the base pointer is valid. It's possible that some + // code in the process is compiled with --copt=-fomit-frame-pointer or // --copt=-momit-leaf-frame-pointer. 
// // TODO(bcmills): -momit-leaf-frame-pointer is currently the default @@ -247,7 +246,7 @@ static void **NextStackFrame(void **old_fp, const void *uc) { // using an alternate signal stack. // // TODO(bcmills): The GetFP call should be completely unnecessary when - // SHRINK_WRAP_FRAME_POINTER is set (because we should be back in the thread's + // ENABLE_COMBINED_UNWINDER is set (because we should be back in the thread's // stack by this point), but it is empirically still needed (e.g. when the // stack includes a call to abort). unw_get_reg returns UNW_EBADREG for some // frames. Figure out why GetValidFrameAddr and/or libunwind isn't doing what diff --git a/third_party/abseil-cpp/absl/debugging/leak_check.cc b/third_party/abseil-cpp/absl/debugging/leak_check.cc index ff9049559d..764ca0ad00 100644 --- a/third_party/abseil-cpp/absl/debugging/leak_check.cc +++ b/third_party/abseil-cpp/absl/debugging/leak_check.cc @@ -16,6 +16,7 @@ // When lsan is not linked in, these functions are not available, // therefore Abseil code which depends on these functions is conditioned on the // definition of LEAK_SANITIZER. 
+#include "absl/base/attributes.h" #include "absl/debugging/leak_check.h" #ifndef LEAK_SANITIZER @@ -23,6 +24,7 @@ namespace absl { ABSL_NAMESPACE_BEGIN bool HaveLeakSanitizer() { return false; } +bool LeakCheckerIsActive() { return false; } void DoIgnoreLeak(const void*) { } void RegisterLivePointers(const void*, size_t) { } void UnRegisterLivePointers(const void*, size_t) { } @@ -35,9 +37,23 @@ ABSL_NAMESPACE_END #include <sanitizer/lsan_interface.h> +#if ABSL_HAVE_ATTRIBUTE_WEAK +extern "C" ABSL_ATTRIBUTE_WEAK int __lsan_is_turned_off(); +#endif + namespace absl { ABSL_NAMESPACE_BEGIN bool HaveLeakSanitizer() { return true; } + +#if ABSL_HAVE_ATTRIBUTE_WEAK +bool LeakCheckerIsActive() { + return !(&__lsan_is_turned_off && __lsan_is_turned_off()); +} +#else +bool LeakCheckerIsActive() { return true; } +#endif + +bool FindAndReportLeaks() { return __lsan_do_recoverable_leak_check(); } void DoIgnoreLeak(const void* ptr) { __lsan_ignore_object(ptr); } void RegisterLivePointers(const void* ptr, size_t size) { __lsan_register_root_region(ptr, size); diff --git a/third_party/abseil-cpp/absl/debugging/leak_check.h b/third_party/abseil-cpp/absl/debugging/leak_check.h index b66a81c3bc..5fc2b052e4 100644 --- a/third_party/abseil-cpp/absl/debugging/leak_check.h +++ b/third_party/abseil-cpp/absl/debugging/leak_check.h @@ -43,6 +43,12 @@ ABSL_NAMESPACE_BEGIN // currently built into this target. bool HaveLeakSanitizer(); +// LeakCheckerIsActive() +// +// Returns true if a leak-checking sanitizer (either ASan or standalone LSan) is +// currently built into this target and is turned on. +bool LeakCheckerIsActive(); + // DoIgnoreLeak() // // Implements `IgnoreLeak()` below. This function should usually @@ -71,6 +77,19 @@ T* IgnoreLeak(T* ptr) { return ptr; } +// FindAndReportLeaks() +// +// If any leaks are detected, prints a leak report and returns true. This +// function may be called repeatedly, and does not affect end-of-process leak +// checking. 
+// +// Example: +// if (FindAndReportLeaks()) { +// ... diagnostic already printed. Exit with failure code. +// exit(1) +// } +bool FindAndReportLeaks(); + // LeakCheckDisabler // // This helper class indicates that any heap allocations done in the code block diff --git a/third_party/abseil-cpp/absl/debugging/leak_check_test.cc b/third_party/abseil-cpp/absl/debugging/leak_check_test.cc index b5cc487488..9fcfc8e50b 100644 --- a/third_party/abseil-cpp/absl/debugging/leak_check_test.cc +++ b/third_party/abseil-cpp/absl/debugging/leak_check_test.cc @@ -23,8 +23,10 @@ namespace { TEST(LeakCheckTest, DetectLeakSanitizer) { #ifdef ABSL_EXPECT_LEAK_SANITIZER EXPECT_TRUE(absl::HaveLeakSanitizer()); + EXPECT_TRUE(absl::LeakCheckerIsActive()); #else EXPECT_FALSE(absl::HaveLeakSanitizer()); + EXPECT_FALSE(absl::LeakCheckerIsActive()); #endif } diff --git a/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc b/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc index f4d5727bde..87dbd078b9 100644 --- a/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc +++ b/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc @@ -701,6 +701,16 @@ static ABSL_ATTRIBUTE_NOINLINE FindSymbolResult FindSymbol( const char *start_address = ComputeOffset(original_start_address, relocation); +#ifdef __arm__ + // ARM functions are always aligned to multiples of two bytes; the + // lowest-order bit in start_address is ignored by the CPU and indicates + // whether the function contains ARM (0) or Thumb (1) code. We don't care + // about what encoding is being used; we just want the real start address + // of the function. + start_address = reinterpret_cast<const char *>( + reinterpret_cast<uintptr_t>(start_address) & ~1); +#endif + if (deref_function_descriptor_pointer && InSection(original_start_address, opd)) { // The opd section is mapped into memory. 
Just dereference diff --git a/third_party/abseil-cpp/absl/debugging/symbolize_test.cc b/third_party/abseil-cpp/absl/debugging/symbolize_test.cc index a2dd4956c4..35de02e24b 100644 --- a/third_party/abseil-cpp/absl/debugging/symbolize_test.cc +++ b/third_party/abseil-cpp/absl/debugging/symbolize_test.cc @@ -477,6 +477,46 @@ void ABSL_ATTRIBUTE_NOINLINE TestWithReturnAddress() { #endif } +#if defined(__arm__) && ABSL_HAVE_ATTRIBUTE(target) +// Test that we correctly identify bounds of Thumb functions on ARM. +// +// Thumb functions have the lowest-order bit set in their addresses in the ELF +// symbol table. This requires some extra logic to properly compute function +// bounds. To test this logic, nudge a Thumb function right up against an ARM +// function and try to symbolize the ARM function. +// +// A naive implementation will simply use the Thumb function's entry point as +// written in the symbol table and will therefore treat the Thumb function as +// extending one byte further in the instruction stream than it actually does. +// When asked to symbolize the start of the ARM function, it will identify an +// overlap between the Thumb and ARM functions, and it will return the name of +// the Thumb function. +// +// A correct implementation, on the other hand, will null out the lowest-order +// bit in the Thumb function's entry point. It will correctly compute the end of +// the Thumb function, it will find no overlap between the Thumb and ARM +// functions, and it will return the name of the ARM function. 
+ +__attribute__((target("thumb"))) int ArmThumbOverlapThumb(int x) { + return x * x * x; +} + +__attribute__((target("arm"))) int ArmThumbOverlapArm(int x) { + return x * x * x; +} + +void ABSL_ATTRIBUTE_NOINLINE TestArmThumbOverlap() { +#if defined(ABSL_HAVE_ATTRIBUTE_NOINLINE) + const char *symbol = TrySymbolize((void *)&ArmThumbOverlapArm); + ABSL_RAW_CHECK(symbol != nullptr, "TestArmThumbOverlap failed"); + ABSL_RAW_CHECK(strcmp("ArmThumbOverlapArm()", symbol) == 0, + "TestArmThumbOverlap failed"); + std::cout << "TestArmThumbOverlap passed" << std::endl; +#endif +} + +#endif // defined(__arm__) && ABSL_HAVE_ATTRIBUTE(target) + #elif defined(_WIN32) #if !defined(ABSL_CONSUME_DLL) @@ -551,6 +591,9 @@ int main(int argc, char **argv) { TestWithPCInsideInlineFunction(); TestWithPCInsideNonInlineFunction(); TestWithReturnAddress(); +#if defined(__arm__) && ABSL_HAVE_ATTRIBUTE(target) + TestArmThumbOverlap(); +#endif #endif return RUN_ALL_TESTS(); diff --git a/third_party/abseil-cpp/absl/flags/CMakeLists.txt b/third_party/abseil-cpp/absl/flags/CMakeLists.txt index caac69cf89..956f70f868 100644 --- a/third_party/abseil-cpp/absl/flags/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/flags/CMakeLists.txt @@ -239,6 +239,7 @@ absl_cc_library( absl::flags_private_handle_accessor absl::flags_program_name absl::flags_reflection + absl::flat_hash_map absl::strings absl::synchronization ) @@ -309,7 +310,7 @@ absl_cc_test( absl::flags_reflection absl::memory absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -321,7 +322,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::flags_config - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -341,7 +342,7 @@ absl_cc_test( absl::flags_reflection absl::strings absl::time - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -353,7 +354,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::flags_marshalling - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -372,7 +373,7 @@ absl_cc_test( absl::scoped_set_env absl::span 
absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -384,7 +385,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::flags_path_util - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -397,7 +398,7 @@ absl_cc_test( DEPS absl::flags_program_name absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -414,7 +415,7 @@ absl_cc_test( absl::flags_usage absl::memory absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -428,7 +429,7 @@ absl_cc_test( absl::base absl::flags_internal absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -443,7 +444,7 @@ absl_cc_test( absl::flags_path_util absl::flags_program_name absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -462,5 +463,5 @@ absl_cc_test( absl::flags_reflection absl::flags_usage absl::strings - gtest + GTest::gtest ) diff --git a/third_party/abseil-cpp/absl/flags/flag.h b/third_party/abseil-cpp/absl/flags/flag.h index f09580b06a..14209e7ba7 100644 --- a/third_party/abseil-cpp/absl/flags/flag.h +++ b/third_party/abseil-cpp/absl/flags/flag.h @@ -265,6 +265,8 @@ ABSL_NAMESPACE_END // // ABSL_FLAG(T, name, default_value, help).OnUpdate(callback); // +// `callback` should be convertible to `void (*)()`. +// // After any setting of the flag value, the callback will be called at least // once. A rapid sequence of changes may be merged together into the same // callback. No concurrent calls to the callback will be made for the same @@ -279,7 +281,6 @@ ABSL_NAMESPACE_END // Note: ABSL_FLAG.OnUpdate() does not have a public definition. Hence, this // comment serves as its API documentation. 
- // ----------------------------------------------------------------------------- // Implementation details below this section // ----------------------------------------------------------------------------- diff --git a/third_party/abseil-cpp/absl/flags/internal/usage.cc b/third_party/abseil-cpp/absl/flags/internal/usage.cc index a588c7f73a..949709e883 100644 --- a/third_party/abseil-cpp/absl/flags/internal/usage.cc +++ b/third_party/abseil-cpp/absl/flags/internal/usage.cc @@ -245,7 +245,7 @@ void FlagsHelpImpl(std::ostream& out, PerFlagFilter filter_cb, << XMLElement("usage", program_usage_message) << '\n'; } - // Map of package name to + // Ordered map of package name to // map of file name to // vector of flags in the file. // This map is used to output matching flags grouped by package and file @@ -273,20 +273,26 @@ void FlagsHelpImpl(std::ostream& out, PerFlagFilter filter_cb, absl::string_view package_separator; // controls blank lines between packages absl::string_view file_separator; // controls blank lines between files - for (const auto& package : matching_flags) { + for (auto& package : matching_flags) { if (format == HelpFormat::kHumanReadable) { out << package_separator; package_separator = "\n\n"; } file_separator = ""; - for (const auto& flags_in_file : package.second) { + for (auto& flags_in_file : package.second) { if (format == HelpFormat::kHumanReadable) { out << file_separator << " Flags from " << flags_in_file.first << ":\n"; file_separator = "\n"; } + std::sort(std::begin(flags_in_file.second), + std::end(flags_in_file.second), + [](const CommandLineFlag* lhs, const CommandLineFlag* rhs) { + return lhs->Name() < rhs->Name(); + }); + for (const auto* flag : flags_in_file.second) { flags_internal::FlagHelp(out, *flag, format); } diff --git a/third_party/abseil-cpp/absl/flags/reflection.cc b/third_party/abseil-cpp/absl/flags/reflection.cc index 0c76110163..dbce4032ab 100644 --- a/third_party/abseil-cpp/absl/flags/reflection.cc +++ 
b/third_party/abseil-cpp/absl/flags/reflection.cc @@ -18,11 +18,11 @@ #include <assert.h> #include <atomic> -#include <map> #include <string> #include "absl/base/config.h" #include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" #include "absl/flags/commandlineflag.h" #include "absl/flags/internal/private_handle_accessor.h" #include "absl/flags/internal/registry.h" @@ -68,7 +68,7 @@ class FlagRegistry { friend void FinalizeRegistry(); // The map from name to flag, for FindFlag(). - using FlagMap = std::map<absl::string_view, CommandLineFlag*>; + using FlagMap = absl::flat_hash_map<absl::string_view, CommandLineFlag*>; using FlagIterator = FlagMap::iterator; using FlagConstIterator = FlagMap::const_iterator; FlagMap flags_; @@ -204,6 +204,10 @@ void FinalizeRegistry() { for (const auto& f : registry.flags_) { registry.flat_flags_.push_back(f.second); } + std::sort(std::begin(registry.flat_flags_), std::end(registry.flat_flags_), + [](const CommandLineFlag* lhs, const CommandLineFlag* rhs) { + return lhs->Name() < rhs->Name(); + }); registry.flags_.clear(); registry.finalized_flags_.store(true, std::memory_order_release); } diff --git a/third_party/abseil-cpp/absl/functional/CMakeLists.txt b/third_party/abseil-cpp/absl/functional/CMakeLists.txt index cda914f2cd..3919e9a1de 100644 --- a/third_party/abseil-cpp/absl/functional/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/functional/CMakeLists.txt @@ -39,7 +39,7 @@ absl_cc_test( DEPS absl::bind_front absl::memory - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -68,5 +68,5 @@ absl_cc_test( absl::function_ref absl::memory absl::test_instance_tracker - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/functional/function_ref.h b/third_party/abseil-cpp/absl/functional/function_ref.h index 6e03ac2e04..5790a65251 100644 --- a/third_party/abseil-cpp/absl/functional/function_ref.h +++ b/third_party/abseil-cpp/absl/functional/function_ref.h @@ -122,6 +122,7 @@ class 
FunctionRef<R(Args...)> { // To help prevent subtle lifetime bugs, FunctionRef is not assignable. // Typically, it should only be used as an argument type. FunctionRef& operator=(const FunctionRef& rhs) = delete; + FunctionRef(const FunctionRef& rhs) = default; // Call the underlying object. R operator()(Args... args) const { diff --git a/third_party/abseil-cpp/absl/hash/CMakeLists.txt b/third_party/abseil-cpp/absl/hash/CMakeLists.txt index b43bfa542f..c82f66f02c 100644 --- a/third_party/abseil-cpp/absl/hash/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/hash/CMakeLists.txt @@ -52,7 +52,7 @@ absl_cc_library( absl::meta absl::strings absl::variant - gmock + GTest::gmock TESTONLY ) @@ -72,7 +72,7 @@ absl_cc_test( absl::spy_hash_state absl::meta absl::int128 - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -113,7 +113,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::city - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -141,5 +141,5 @@ absl_cc_test( DEPS absl::wyhash absl::strings - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/hash/hash.h b/third_party/abseil-cpp/absl/hash/hash.h index 5de132cac8..8282ea53c6 100644 --- a/third_party/abseil-cpp/absl/hash/hash.h +++ b/third_party/abseil-cpp/absl/hash/hash.h @@ -73,6 +73,8 @@ #ifndef ABSL_HASH_HASH_H_ #define ABSL_HASH_HASH_H_ +#include <tuple> + #include "absl/hash/internal/hash.h" namespace absl { @@ -214,6 +216,26 @@ ABSL_NAMESPACE_BEGIN template <typename T> using Hash = absl::hash_internal::Hash<T>; +// HashOf +// +// absl::HashOf() is a helper that generates a hash from the values of its +// arguments. It dispatches to absl::Hash directly, as follows: +// * HashOf(t) == absl::Hash<T>{}(t) +// * HashOf(a, b, c) == HashOf(std::make_tuple(a, b, c)) +// +// HashOf(a1, a2, ...) == HashOf(b1, b2, ...) is guaranteed when +// * The argument lists have pairwise identical C++ types +// * a1 == b1 && a2 == b2 && ... 
+// +// The requirement that the arguments match in both type and value is critical. +// It means that `a == b` does not necessarily imply `HashOf(a) == HashOf(b)` if +// `a` and `b` have different types. For example, `HashOf(2) != HashOf(2.0)`. +template <int&... ExplicitArgumentBarrier, typename... Types> +size_t HashOf(const Types&... values) { + auto tuple = std::tie(values...); + return absl::Hash<decltype(tuple)>{}(tuple); +} + // HashState // // A type erased version of the hash state concept, for use in user-defined diff --git a/third_party/abseil-cpp/absl/hash/hash_test.cc b/third_party/abseil-cpp/absl/hash/hash_test.cc index 1d2e6cf0df..b3ddebdd42 100644 --- a/third_party/abseil-cpp/absl/hash/hash_test.cc +++ b/third_party/abseil-cpp/absl/hash/hash_test.cc @@ -973,4 +973,39 @@ TEST(HashTest, DoesNotUseImplicitConversionsToBool) { absl::Hash<ValueWithBoolConversion>()(ValueWithBoolConversion{1})); } +TEST(HashOf, MatchesHashForSingleArgument) { + std::string s = "forty two"; + int i = 42; + double d = 42.0; + std::tuple<int, int> t{4, 2}; + + EXPECT_EQ(absl::HashOf(s), absl::Hash<std::string>{}(s)); + EXPECT_EQ(absl::HashOf(i), absl::Hash<int>{}(i)); + EXPECT_EQ(absl::HashOf(d), absl::Hash<double>{}(d)); + EXPECT_EQ(absl::HashOf(t), (absl::Hash<std::tuple<int, int>>{}(t))); +} + +TEST(HashOf, MatchesHashOfTupleForMultipleArguments) { + std::string hello = "hello"; + std::string world = "world"; + + EXPECT_EQ(absl::HashOf(), absl::HashOf(std::make_tuple())); + EXPECT_EQ(absl::HashOf(hello), absl::HashOf(std::make_tuple(hello))); + EXPECT_EQ(absl::HashOf(hello, world), + absl::HashOf(std::make_tuple(hello, world))); +} + +template <typename T> +std::true_type HashOfExplicitParameter(decltype(absl::HashOf<T>(0))) { + return {}; +} +template <typename T> +std::false_type HashOfExplicitParameter(size_t) { + return {}; +} + +TEST(HashOf, CantPassExplicitTemplateParameters) { + EXPECT_FALSE(HashOfExplicitParameter<int>(0)); +} + } // namespace diff --git 
a/third_party/abseil-cpp/absl/hash/internal/hash.cc b/third_party/abseil-cpp/absl/hash/internal/hash.cc index 1433eb9db3..06f53a59c5 100644 --- a/third_party/abseil-cpp/absl/hash/internal/hash.cc +++ b/third_party/abseil-cpp/absl/hash/internal/hash.cc @@ -18,9 +18,8 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace hash_internal { -uint64_t HashState::CombineLargeContiguousImpl32(uint64_t state, - const unsigned char* first, - size_t len) { +uint64_t MixingHashState::CombineLargeContiguousImpl32( + uint64_t state, const unsigned char* first, size_t len) { while (len >= PiecewiseChunkSize()) { state = Mix(state, absl::hash_internal::CityHash32(reinterpret_cast<const char*>(first), @@ -33,9 +32,8 @@ uint64_t HashState::CombineLargeContiguousImpl32(uint64_t state, std::integral_constant<int, 4>{}); } -uint64_t HashState::CombineLargeContiguousImpl64(uint64_t state, - const unsigned char* first, - size_t len) { +uint64_t MixingHashState::CombineLargeContiguousImpl64( + uint64_t state, const unsigned char* first, size_t len) { while (len >= PiecewiseChunkSize()) { state = Mix(state, Hash64(first, PiecewiseChunkSize())); len -= PiecewiseChunkSize(); @@ -46,7 +44,7 @@ uint64_t HashState::CombineLargeContiguousImpl64(uint64_t state, std::integral_constant<int, 8>{}); } -ABSL_CONST_INIT const void* const HashState::kSeed = &kSeed; +ABSL_CONST_INIT const void* const MixingHashState::kSeed = &kSeed; // The salt array used by Wyhash. This array is NOT the mechanism used to make // absl::Hash non-deterministic between program invocations. 
See `Seed()` for @@ -61,7 +59,7 @@ constexpr uint64_t kWyhashSalt[5] = { uint64_t{0x452821E638D01377}, }; -uint64_t HashState::WyhashImpl(const unsigned char* data, size_t len) { +uint64_t MixingHashState::WyhashImpl(const unsigned char* data, size_t len) { return Wyhash(data, len, Seed(), kWyhashSalt); } diff --git a/third_party/abseil-cpp/absl/hash/internal/hash.h b/third_party/abseil-cpp/absl/hash/internal/hash.h index 7fb0af0b96..69dbbc6ba0 100644 --- a/third_party/abseil-cpp/absl/hash/internal/hash.h +++ b/third_party/abseil-cpp/absl/hash/internal/hash.h @@ -379,7 +379,7 @@ template <typename H, typename... Ts> // This SFINAE gets MSVC confused under some conditions. Let's just disable it // for now. H -#else // _MSC_VER +#else // _MSC_VER typename std::enable_if<absl::conjunction<is_hashable<Ts>...>::value, H>::type #endif // _MSC_VER AbslHashValue(H hash_state, const std::tuple<Ts...>& t) { @@ -714,8 +714,8 @@ template <typename T> struct is_hashable : std::integral_constant<bool, HashSelect::template Apply<T>::value> {}; -// HashState -class ABSL_DLL HashState : public HashStateBase<HashState> { +// MixingHashState +class ABSL_DLL MixingHashState : public HashStateBase<MixingHashState> { // absl::uint128 is not an alias or a thin wrapper around the intrinsic. // We use the intrinsic when available to improve performance. #ifdef ABSL_HAVE_INTRINSIC_INT128 @@ -734,22 +734,23 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { public: // Move only - HashState(HashState&&) = default; - HashState& operator=(HashState&&) = default; + MixingHashState(MixingHashState&&) = default; + MixingHashState& operator=(MixingHashState&&) = default; - // HashState::combine_contiguous() + // MixingHashState::combine_contiguous() // // Fundamental base case for hash recursion: mixes the given range of bytes // into the hash state. 
- static HashState combine_contiguous(HashState hash_state, - const unsigned char* first, size_t size) { - return HashState( + static MixingHashState combine_contiguous(MixingHashState hash_state, + const unsigned char* first, + size_t size) { + return MixingHashState( CombineContiguousImpl(hash_state.state_, first, size, std::integral_constant<int, sizeof(size_t)>{})); } - using HashState::HashStateBase::combine_contiguous; + using MixingHashState::HashStateBase::combine_contiguous; - // HashState::hash() + // MixingHashState::hash() // // For performance reasons in non-opt mode, we specialize this for // integral types. @@ -761,24 +762,24 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { return static_cast<size_t>(Mix(Seed(), static_cast<uint64_t>(value))); } - // Overload of HashState::hash() + // Overload of MixingHashState::hash() template <typename T, absl::enable_if_t<!IntegralFastPath<T>::value, int> = 0> static size_t hash(const T& value) { - return static_cast<size_t>(combine(HashState{}, value).state_); + return static_cast<size_t>(combine(MixingHashState{}, value).state_); } private: // Invoked only once for a given argument; that plus the fact that this is // move-only ensures that there is only one non-moved-from object. - HashState() : state_(Seed()) {} + MixingHashState() : state_(Seed()) {} // Workaround for MSVC bug. // We make the type copyable to fix the calling convention, even though we // never actually copy it. Keep it private to not affect the public API of the // type. - HashState(const HashState&) = default; + MixingHashState(const MixingHashState&) = default; - explicit HashState(uint64_t state) : state_(state) {} + explicit MixingHashState(uint64_t state) : state_(state) {} // Implementation of the base case for combine_contiguous where we actually // mix the bytes into the state. 
@@ -793,7 +794,6 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { std::integral_constant<int, 8> /* sizeof_size_t */); - // Slow dispatch path for calls to CombineContiguousImpl with a size argument // larger than PiecewiseChunkSize(). Has the same effect as calling // CombineContiguousImpl() repeatedly with the chunk stride size. @@ -911,8 +911,8 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { uint64_t state_; }; -// HashState::CombineContiguousImpl() -inline uint64_t HashState::CombineContiguousImpl( +// MixingHashState::CombineContiguousImpl() +inline uint64_t MixingHashState::CombineContiguousImpl( uint64_t state, const unsigned char* first, size_t len, std::integral_constant<int, 4> /* sizeof_size_t */) { // For large values we use CityHash, for small ones we just use a @@ -934,8 +934,8 @@ inline uint64_t HashState::CombineContiguousImpl( return Mix(state, v); } -// Overload of HashState::CombineContiguousImpl() -inline uint64_t HashState::CombineContiguousImpl( +// Overload of MixingHashState::CombineContiguousImpl() +inline uint64_t MixingHashState::CombineContiguousImpl( uint64_t state, const unsigned char* first, size_t len, std::integral_constant<int, 8> /* sizeof_size_t */) { // For large values we use Wyhash or CityHash depending on the platform, for @@ -976,7 +976,9 @@ struct PoisonedHash : private AggregateBarrier { template <typename T> struct HashImpl { - size_t operator()(const T& value) const { return HashState::hash(value); } + size_t operator()(const T& value) const { + return MixingHashState::hash(value); + } }; template <typename T> diff --git a/third_party/abseil-cpp/absl/hash/internal/wyhash.h b/third_party/abseil-cpp/absl/hash/internal/wyhash.h index 4aff4e931a..2b534b4706 100644 --- a/third_party/abseil-cpp/absl/hash/internal/wyhash.h +++ b/third_party/abseil-cpp/absl/hash/internal/wyhash.h @@ -36,7 +36,7 @@ namespace hash_internal { // integers are hashed into the result. 
// // To allow all hashable types (including string_view and Span) to depend on -// this algoritm, we keep the API low-level, with as few dependencies as +// this algorithm, we keep the API low-level, with as few dependencies as // possible. uint64_t Wyhash(const void* data, size_t len, uint64_t seed, const uint64_t salt[5]); diff --git a/third_party/abseil-cpp/absl/memory/CMakeLists.txt b/third_party/abseil-cpp/absl/memory/CMakeLists.txt index 78fb7e1b31..9d50e1dcd4 100644 --- a/third_party/abseil-cpp/absl/memory/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/memory/CMakeLists.txt @@ -37,7 +37,7 @@ absl_cc_test( DEPS absl::memory absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -51,5 +51,5 @@ absl_cc_test( absl::memory absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/memory/memory.h b/third_party/abseil-cpp/absl/memory/memory.h index 2b5ff623d4..d63326068f 100644 --- a/third_party/abseil-cpp/absl/memory/memory.h +++ b/third_party/abseil-cpp/absl/memory/memory.h @@ -420,7 +420,7 @@ struct pointer_traits<T*> { // // A C++11 compatible implementation of C++17's std::allocator_traits. 
// -#if __cplusplus >= 201703L +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) using std::allocator_traits; #else // __cplusplus >= 201703L template <typename Alloc> diff --git a/third_party/abseil-cpp/absl/meta/CMakeLists.txt b/third_party/abseil-cpp/absl/meta/CMakeLists.txt index 672ead2fd0..9de4bd3751 100644 --- a/third_party/abseil-cpp/absl/meta/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/meta/CMakeLists.txt @@ -35,7 +35,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::type_traits - gmock_main + GTest::gmock_main ) # component target diff --git a/third_party/abseil-cpp/absl/meta/type_traits.h b/third_party/abseil-cpp/absl/meta/type_traits.h index d5cb5f3be3..e7c123936d 100644 --- a/third_party/abseil-cpp/absl/meta/type_traits.h +++ b/third_party/abseil-cpp/absl/meta/type_traits.h @@ -499,6 +499,27 @@ struct is_trivially_copy_assignable #endif // ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE }; +#if defined(__cpp_lib_remove_cvref) && __cpp_lib_remove_cvref >= 201711L +template <typename T> +using remove_cvref = std::remove_cvref<T>; + +template <typename T> +using remove_cvref_t = typename std::remove_cvref<T>::type; +#else +// remove_cvref() +// +// C++11 compatible implementation of std::remove_cvref which was added in +// C++20. +template <typename T> +struct remove_cvref { + using type = + typename std::remove_cv<typename std::remove_reference<T>::type>::type; +}; + +template <typename T> +using remove_cvref_t = typename remove_cvref<T>::type; +#endif + namespace type_traits_internal { // is_trivially_copyable() // @@ -613,7 +634,7 @@ using underlying_type_t = typename std::underlying_type<T>::type; namespace type_traits_internal { -#if __cplusplus >= 201703L +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) // std::result_of is deprecated (C++17) or removed (C++20) template<typename> struct result_of; template<typename F, typename... 
Args> diff --git a/third_party/abseil-cpp/absl/meta/type_traits_test.cc b/third_party/abseil-cpp/absl/meta/type_traits_test.cc index 1aafd0d49a..0ef5b66558 100644 --- a/third_party/abseil-cpp/absl/meta/type_traits_test.cc +++ b/third_party/abseil-cpp/absl/meta/type_traits_test.cc @@ -942,6 +942,34 @@ TEST(TypeTraitsTest, TestTriviallyCopyable) { absl::type_traits_internal::is_trivially_copyable<Trivial&>::value); } +TEST(TypeTraitsTest, TestRemoveCVRef) { + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int>::type, int>::value)); + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int&>::type, int>::value)); + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int&&>::type, int>::value)); + EXPECT_TRUE(( + std::is_same<typename absl::remove_cvref<const int&>::type, int>::value)); + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int*>::type, int*>::value)); + // Does not remove const in this case. + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int*>::type, + const int*>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<int[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<int(&)[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<int(&&)[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int(&)[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int(&&)[2]>::type, + int[2]>::value)); +} + #define ABSL_INTERNAL_EXPECT_ALIAS_EQUIVALENCE(trait_name, ...) 
\ EXPECT_TRUE((std::is_same<typename std::trait_name<__VA_ARGS__>::type, \ absl::trait_name##_t<__VA_ARGS__>>::value)) diff --git a/third_party/abseil-cpp/absl/numeric/CMakeLists.txt b/third_party/abseil-cpp/absl/numeric/CMakeLists.txt index 781987dc88..26df5cf703 100644 --- a/third_party/abseil-cpp/absl/numeric/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/numeric/CMakeLists.txt @@ -38,7 +38,7 @@ absl_cc_test( absl::bits absl::core_headers absl::random_random - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -73,7 +73,7 @@ absl_cc_test( absl::core_headers absl::hash_testing absl::type_traits - gmock_main + GTest::gmock_main ) # component target diff --git a/third_party/abseil-cpp/absl/numeric/int128.h b/third_party/abseil-cpp/absl/numeric/int128.h index 0dd814a890..198aa19504 100644 --- a/third_party/abseil-cpp/absl/numeric/int128.h +++ b/third_party/abseil-cpp/absl/numeric/int128.h @@ -810,6 +810,14 @@ inline bool operator>=(uint128 lhs, uint128 rhs) { return !(lhs < rhs); } // Unary operators. +constexpr inline uint128 operator+(uint128 val) { + return val; +} + +constexpr inline int128 operator+(int128 val) { + return val; +} + inline uint128 operator-(uint128 val) { uint64_t hi = ~Uint128High64(val); uint64_t lo = ~Uint128Low64(val) + 1; @@ -817,27 +825,27 @@ inline uint128 operator-(uint128 val) { return MakeUint128(hi, lo); } -inline bool operator!(uint128 val) { +constexpr inline bool operator!(uint128 val) { return !Uint128High64(val) && !Uint128Low64(val); } // Logical operators. 
-inline uint128 operator~(uint128 val) { +constexpr inline uint128 operator~(uint128 val) { return MakeUint128(~Uint128High64(val), ~Uint128Low64(val)); } -inline uint128 operator|(uint128 lhs, uint128 rhs) { +constexpr inline uint128 operator|(uint128 lhs, uint128 rhs) { return MakeUint128(Uint128High64(lhs) | Uint128High64(rhs), Uint128Low64(lhs) | Uint128Low64(rhs)); } -inline uint128 operator&(uint128 lhs, uint128 rhs) { +constexpr inline uint128 operator&(uint128 lhs, uint128 rhs) { return MakeUint128(Uint128High64(lhs) & Uint128High64(rhs), Uint128Low64(lhs) & Uint128Low64(rhs)); } -inline uint128 operator^(uint128 lhs, uint128 rhs) { +constexpr inline uint128 operator^(uint128 lhs, uint128 rhs) { return MakeUint128(Uint128High64(lhs) ^ Uint128High64(rhs), Uint128Low64(lhs) ^ Uint128Low64(rhs)); } diff --git a/third_party/abseil-cpp/absl/numeric/int128_test.cc b/third_party/abseil-cpp/absl/numeric/int128_test.cc index bc86c714ac..c445d89a99 100644 --- a/third_party/abseil-cpp/absl/numeric/int128_test.cc +++ b/third_party/abseil-cpp/absl/numeric/int128_test.cc @@ -226,6 +226,11 @@ TEST(Uint128, AllTests) { EXPECT_EQ(test >>= 1, one); EXPECT_EQ(test <<= 1, two); + EXPECT_EQ(big, +big); + EXPECT_EQ(two, +two); + EXPECT_EQ(absl::Uint128Max(), +absl::Uint128Max()); + EXPECT_EQ(zero, +zero); + EXPECT_EQ(big, -(-big)); EXPECT_EQ(two, -((-one) - 1)); EXPECT_EQ(absl::Uint128Max(), -one); @@ -769,6 +774,19 @@ TEST(Int128, ComparisonTest) { } } +TEST(Int128, UnaryPlusTest) { + int64_t values64[] = {0, 1, 12345, 0x4000000000000000, + std::numeric_limits<int64_t>::max()}; + for (int64_t value : values64) { + SCOPED_TRACE(::testing::Message() << "value = " << value); + + EXPECT_EQ(absl::int128(value), +absl::int128(value)); + EXPECT_EQ(absl::int128(-value), +absl::int128(-value)); + EXPECT_EQ(absl::MakeInt128(value, 0), +absl::MakeInt128(value, 0)); + EXPECT_EQ(absl::MakeInt128(-value, 0), +absl::MakeInt128(-value, 0)); + } +} + TEST(Int128, UnaryNegationTest) { int64_t 
values64[] = {0, 1, 12345, 0x4000000000000000, std::numeric_limits<int64_t>::max()}; diff --git a/third_party/abseil-cpp/absl/random/CMakeLists.txt b/third_party/abseil-cpp/absl/random/CMakeLists.txt index 3009a0348a..9d1c67fb33 100644 --- a/third_party/abseil-cpp/absl/random/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/random/CMakeLists.txt @@ -62,8 +62,8 @@ absl_cc_test( absl::random_random absl::random_internal_sequence_urbg absl::fast_type_id - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -119,8 +119,8 @@ absl_cc_library( absl::type_traits absl::utility absl::variant - gmock - gtest + GTest::gmock + GTest::gtest TESTONLY ) @@ -136,8 +136,8 @@ absl_cc_test( DEPS absl::random_mocking_bit_gen absl::random_random - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -153,8 +153,8 @@ absl_cc_test( absl::random_bit_gen_ref absl::random_mocking_bit_gen absl::random_random - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -245,8 +245,8 @@ absl_cc_test( absl::random_random absl::random_internal_sequence_urbg absl::random_internal_pcg_engine - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -268,8 +268,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -285,8 +285,8 @@ absl_cc_test( absl::random_distributions absl::random_random absl::random_internal_distribution_test_util - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -301,8 +301,8 @@ absl_cc_test( absl::random_distributions absl::random_random absl::raw_logging_internal - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -322,8 +322,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -343,8 +343,8 @@ absl_cc_test( 
absl::random_random absl::raw_logging_internal absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -367,8 +367,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -391,8 +391,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -414,8 +414,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -435,8 +435,8 @@ absl_cc_test( absl::random_random absl::raw_logging_internal absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -456,8 +456,8 @@ absl_cc_test( absl::random_internal_sequence_urbg absl::random_random absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -477,8 +477,8 @@ absl_cc_test( absl::random_random absl::raw_logging_internal absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -492,7 +492,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_random - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -508,8 +508,8 @@ absl_cc_test( absl::random_seed_sequences absl::random_internal_nonsecure_base absl::random_random - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -894,7 +894,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_traits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -911,7 +911,7 @@ absl_cc_test( absl::bits absl::flags absl::random_internal_generate_real - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. 
@@ -926,7 +926,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_distribution_test_util - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -941,7 +941,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_fastmath - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -957,8 +957,8 @@ absl_cc_test( DEPS absl::random_internal_explicit_seed_seq absl::random_seed_sequences - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -973,8 +973,8 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_salted_seed_seq - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -990,7 +990,7 @@ absl_cc_test( DEPS absl::core_headers absl::random_internal_distribution_test_util - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1005,7 +1005,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_fast_uniform_bits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1024,7 +1024,7 @@ absl_cc_test( absl::random_distributions absl::random_seed_sequences absl::strings - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1039,8 +1039,8 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_seed_material - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1057,7 +1057,7 @@ absl_cc_test( absl::random_internal_pool_urbg absl::span absl::type_traits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1074,8 +1074,8 @@ absl_cc_test( absl::random_internal_explicit_seed_seq absl::random_internal_pcg_engine absl::time - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. 
@@ -1094,8 +1094,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::time - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1111,7 +1111,7 @@ absl_cc_test( DEPS absl::random_internal_randen absl::type_traits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1127,7 +1127,7 @@ absl_cc_test( DEPS absl::endian absl::random_internal_randen_slow - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1146,8 +1146,8 @@ absl_cc_test( absl::random_internal_randen_hwaes_impl absl::raw_logging_internal absl::str_format - gmock - gtest + GTest::gmock + GTest::gtest ) # Internal-only target, do not depend on directly. @@ -1178,7 +1178,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_uniform_helper - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1193,7 +1193,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_iostream_state_saver - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1210,5 +1210,5 @@ absl_cc_test( absl::random_internal_wide_multiply absl::bits absl::int128 - gtest_main + GTest::gtest_main ) diff --git a/third_party/abseil-cpp/absl/random/beta_distribution_test.cc b/third_party/abseil-cpp/absl/random/beta_distribution_test.cc index 44cdfdd049..d980c969f7 100644 --- a/third_party/abseil-cpp/absl/random/beta_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/beta_distribution_test.cc @@ -15,6 +15,7 @@ #include "absl/random/beta_distribution.h" #include <algorithm> +#include <cfloat> #include <cstddef> #include <cstdint> #include <iterator> @@ -558,6 +559,14 @@ TEST(BetaDistributionTest, StabilityTest) { // dependencies of the distribution change, such as RandU64ToDouble, then this // is also likely to change. 
TEST(BetaDistributionTest, AlgorithmBounds) { +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations are + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. + GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + { absl::random_internal::sequence_urbg urbg( {0x7fbe76c8b4395800ull, 0x8000000000000000ull}); diff --git a/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc b/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc index 6d007006ef..415b14cc76 100644 --- a/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc @@ -99,6 +99,7 @@ TYPED_TEST(DiscreteDistributionTypeTest, Constructor) { } TEST(DiscreteDistributionTest, InitDiscreteDistribution) { + using testing::_; using testing::Pair; { @@ -111,8 +112,8 @@ TEST(DiscreteDistributionTest, InitDiscreteDistribution) { // Each bucket is p=1/3, so bucket 0 will send half it's traffic // to bucket 2, while the rest will retain all of their traffic. 
EXPECT_THAT(q, testing::ElementsAre(Pair(0.5, 2), // - Pair(1.0, 1), // - Pair(1.0, 2))); + Pair(1.0, _), // + Pair(1.0, _))); } { @@ -135,7 +136,7 @@ TEST(DiscreteDistributionTest, InitDiscreteDistribution) { EXPECT_THAT(q, testing::ElementsAre(Pair(b0, 3), // Pair(b1, 3), // - Pair(1.0, 2), // + Pair(1.0, _), // Pair(b3, 2), // Pair(b1, 3))); } diff --git a/third_party/abseil-cpp/absl/random/distributions_test.cc b/third_party/abseil-cpp/absl/random/distributions_test.cc index 5866a07257..d3a5dd75e5 100644 --- a/third_party/abseil-cpp/absl/random/distributions_test.cc +++ b/third_party/abseil-cpp/absl/random/distributions_test.cc @@ -14,6 +14,7 @@ #include "absl/random/distributions.h" +#include <cfloat> #include <cmath> #include <cstdint> #include <random> @@ -224,6 +225,15 @@ TEST_F(RandomDistributionsTest, UniformNoBounds) { TEST_F(RandomDistributionsTest, UniformNonsenseRanges) { // The ranges used in this test are undefined behavior. // The results are arbitrary and subject to future changes. + +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations can be + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. 
+ GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + absl::InsecureBitGen gen; // <uint> diff --git a/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc b/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc index af11d61c15..81a5d17bac 100644 --- a/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc @@ -15,6 +15,7 @@ #include "absl/random/exponential_distribution.h" #include <algorithm> +#include <cfloat> #include <cmath> #include <cstddef> #include <cstdint> @@ -384,6 +385,15 @@ TEST(ExponentialDistributionTest, StabilityTest) { TEST(ExponentialDistributionTest, AlgorithmBounds) { // Relies on absl::uniform_real_distribution, so some of these comments // reference that. + +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations can be + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. + GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + absl::exponential_distribution<double> dist; { diff --git a/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc b/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc index 5bee530770..725100a415 100644 --- a/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc +++ b/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc @@ -194,11 +194,10 @@ RandenPoolEntry* PoolAlignedAlloc() { // Not all the platforms that we build for have std::aligned_alloc, however // since we never free these objects, we can over allocate and munge the // pointers to the correct alignment. 
- void* memory = std::malloc(sizeof(RandenPoolEntry) + kAlignment); - auto x = reinterpret_cast<intptr_t>(memory); + intptr_t x = reinterpret_cast<intptr_t>( + new char[sizeof(RandenPoolEntry) + kAlignment]); auto y = x % kAlignment; - void* aligned = - (y == 0) ? memory : reinterpret_cast<void*>(x + kAlignment - y); + void* aligned = reinterpret_cast<void*>(y == 0 ? x : (x + kAlignment - y)); return new (aligned) RandenPoolEntry(); } diff --git a/third_party/abseil-cpp/absl/random/internal/seed_material.cc b/third_party/abseil-cpp/absl/random/internal/seed_material.cc index 4d38a57419..7c1d9efa42 100644 --- a/third_party/abseil-cpp/absl/random/internal/seed_material.cc +++ b/third_party/abseil-cpp/absl/random/internal/seed_material.cc @@ -28,6 +28,7 @@ #include <cstdlib> #include <cstring> +#include "absl/base/dynamic_annotations.h" #include "absl/base/internal/raw_logging.h" #include "absl/strings/ascii.h" #include "absl/strings/escaping.h" @@ -50,6 +51,12 @@ #endif +#if defined(__GLIBC__) && \ + (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25)) +// glibc >= 2.25 has getentropy() +#define ABSL_RANDOM_USE_GET_ENTROPY 1 +#endif + #if defined(ABSL_RANDOM_USE_BCRYPT) #include <bcrypt.h> @@ -122,8 +129,32 @@ bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { #else +#if defined(ABSL_RANDOM_USE_GET_ENTROPY) +// On *nix, use getentropy() if supported. Note that libc may support +// getentropy(), but the kernel may not, in which case this function will return +// false. +bool ReadSeedMaterialFromGetEntropy(absl::Span<uint32_t> values) { + auto buffer = reinterpret_cast<uint8_t*>(values.data()); + size_t buffer_size = sizeof(uint32_t) * values.size(); + while (buffer_size > 0) { + // getentropy() has a maximum permitted length of 256. 
+ size_t to_read = std::min<size_t>(buffer_size, 256); + int result = getentropy(buffer, to_read); + if (result < 0) { + return false; + } + // https://github.com/google/sanitizers/issues/1173 + // MemorySanitizer can't see through getentropy(). + ABSL_ANNOTATE_MEMORY_IS_INITIALIZED(buffer, to_read); + buffer += to_read; + buffer_size -= to_read; + } + return true; +} +#endif // defined(ABSL_RANDOM_GETENTROPY) + // On *nix, read entropy from /dev/urandom. -bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { +bool ReadSeedMaterialFromDevURandom(absl::Span<uint32_t> values) { const char kEntropyFile[] = "/dev/urandom"; auto buffer = reinterpret_cast<uint8_t*>(values.data()); @@ -150,6 +181,17 @@ bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { return success; } +bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { +#if defined(ABSL_RANDOM_USE_GET_ENTROPY) + if (ReadSeedMaterialFromGetEntropy(values)) { + return true; + } +#endif + // Libc may support getentropy, but the kernel may not, so we still have + // to fallback to ReadSeedMaterialFromDevURandom(). 
+ return ReadSeedMaterialFromDevURandom(values); +} + #endif } // namespace diff --git a/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc b/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc index 18bcd3bce8..035bd284d1 100644 --- a/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc @@ -14,6 +14,7 @@ #include "absl/random/uniform_real_distribution.h" +#include <cfloat> #include <cmath> #include <cstdint> #include <iterator> @@ -70,6 +71,14 @@ using RealTypes = TYPED_TEST_SUITE(UniformRealDistributionTest, RealTypes); TYPED_TEST(UniformRealDistributionTest, ParamSerializeTest) { +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations are + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. + GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + using param_type = typename absl::uniform_real_distribution<TypeParam>::param_type; diff --git a/third_party/abseil-cpp/absl/status/CMakeLists.txt b/third_party/abseil-cpp/absl/status/CMakeLists.txt index f0d798a373..1248dff03e 100644 --- a/third_party/abseil-cpp/absl/status/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/status/CMakeLists.txt @@ -50,7 +50,7 @@ absl_cc_test( DEPS absl::status absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -84,5 +84,5 @@ absl_cc_test( DEPS absl::status absl::statusor - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/status/internal/status_internal.h b/third_party/abseil-cpp/absl/status/internal/status_internal.h index 279f8f55be..ac12940a6d 100644 --- a/third_party/abseil-cpp/absl/status/internal/status_internal.h +++ b/third_party/abseil-cpp/absl/status/internal/status_internal.h @@ -19,6 +19,17 @@ #include 
"absl/container/inlined_vector.h" #include "absl/strings/cord.h" +#ifndef SWIG +// Disabled for SWIG as it doesn't parse attributes correctly. +namespace absl { +ABSL_NAMESPACE_BEGIN +// Returned Status objects may not be ignored. Codesearch doesn't handle ifdefs +// as part of a class definitions (b/6995610), so we use a forward declaration. +class ABSL_MUST_USE_RESULT Status; +ABSL_NAMESPACE_END +} // namespace absl +#endif // !SWIG + namespace absl { ABSL_NAMESPACE_BEGIN @@ -36,12 +47,12 @@ using Payloads = absl::InlinedVector<Payload, 1>; // Reference-counted representation of Status data. struct StatusRep { - StatusRep(absl::StatusCode code, std::string message, - std::unique_ptr<status_internal::Payloads> payloads) + StatusRep(absl::StatusCode code_arg, absl::string_view message_arg, + std::unique_ptr<status_internal::Payloads> payloads_arg) : ref(int32_t{1}), - code(code), - message(std::move(message)), - payloads(std::move(payloads)) {} + code(code_arg), + message(message_arg), + payloads(std::move(payloads_arg)) {} std::atomic<int32_t> ref; absl::StatusCode code; diff --git a/third_party/abseil-cpp/absl/status/status.cc b/third_party/abseil-cpp/absl/status/status.cc index 51a0d26897..5a5cd5c239 100644 --- a/third_party/abseil-cpp/absl/status/status.cc +++ b/third_party/abseil-cpp/absl/status/status.cc @@ -207,19 +207,10 @@ void Status::UnrefNonInlined(uintptr_t rep) { } } -uintptr_t Status::NewRep( - absl::StatusCode code, absl::string_view msg, - std::unique_ptr<status_internal::Payloads> payloads) { - status_internal::StatusRep* rep = new status_internal::StatusRep( - code, std::string(msg.data(), msg.size()), - std::move(payloads)); - return PointerToRep(rep); -} - Status::Status(absl::StatusCode code, absl::string_view msg) : rep_(CodeToInlinedRep(code)) { if (code != absl::StatusCode::kOk && !msg.empty()) { - rep_ = NewRep(code, msg, nullptr); + rep_ = PointerToRep(new status_internal::StatusRep(code, msg, nullptr)); } } @@ -238,9 +229,9 @@ 
absl::StatusCode Status::code() const { void Status::PrepareToModify() { ABSL_RAW_CHECK(!ok(), "PrepareToModify shouldn't be called on OK status."); if (IsInlined(rep_)) { - rep_ = - NewRep(static_cast<absl::StatusCode>(raw_code()), absl::string_view(), - nullptr); + rep_ = PointerToRep(new status_internal::StatusRep( + static_cast<absl::StatusCode>(raw_code()), absl::string_view(), + nullptr)); return; } @@ -251,8 +242,9 @@ void Status::PrepareToModify() { if (rep->payloads) { payloads = absl::make_unique<status_internal::Payloads>(*rep->payloads); } - rep_ = NewRep(rep->code, message(), - std::move(payloads)); + status_internal::StatusRep* const new_rep = new status_internal::StatusRep( + rep->code, message(), std::move(payloads)); + rep_ = PointerToRep(new_rep); UnrefNonInlined(rep_i); } } diff --git a/third_party/abseil-cpp/absl/status/status.h b/third_party/abseil-cpp/absl/status/status.h index df9e330c00..2e05f46e87 100644 --- a/third_party/abseil-cpp/absl/status/status.h +++ b/third_party/abseil-cpp/absl/status/status.h @@ -291,6 +291,10 @@ enum class StatusToStringMode : int { kWithNoExtraData = 0, // ToString will contain the payloads. kWithPayload = 1 << 0, + // ToString will include all the extra data this Status has. + kWithEverything = ~kWithNoExtraData, + // Default mode used by ToString. Its exact value might change in the future. + kDefault = kWithPayload, }; // absl::StatusToStringMode is specified as a bitmask type, which means the @@ -410,7 +414,12 @@ inline StatusToStringMode& operator^=(StatusToStringMode& lhs, // return result; // } // -class ABSL_MUST_USE_RESULT Status final { +// For documentation see https://abseil.io/docs/cpp/guides/status. +// +// Returned Status objects may not be ignored. 
status_internal.h has a forward +// declaration of the form +// class ABSL_MUST_USE_RESULT Status; +class Status final { public: // Constructors @@ -502,7 +511,7 @@ class ABSL_MUST_USE_RESULT Status final { // result, and the payloads to be printed use the status payload printer // mechanism (which is internal). std::string ToString( - StatusToStringMode mode = StatusToStringMode::kWithPayload) const; + StatusToStringMode mode = StatusToStringMode::kDefault) const; // Status::IgnoreError() // diff --git a/third_party/abseil-cpp/absl/status/status_test.cc b/third_party/abseil-cpp/absl/status/status_test.cc index 7116ba671f..1b038f6d98 100644 --- a/third_party/abseil-cpp/absl/status/status_test.cc +++ b/third_party/abseil-cpp/absl/status/status_test.cc @@ -36,7 +36,9 @@ TEST(StatusCode, InsertionOperator) { // its creator, and its classifier. struct ErrorTest { absl::StatusCode code; - using Creator = absl::Status (*)(absl::string_view); + using Creator = absl::Status (*)( + absl::string_view + ); using Classifier = bool (*)(const absl::Status&); Creator creator; Classifier classifier; @@ -78,7 +80,9 @@ TEST(Status, CreateAndClassify) { // expected error code and message. 
std::string message = absl::StrCat("error code ", test.code, " test message"); - absl::Status status = test.creator(message); + absl::Status status = test.creator( + message + ); EXPECT_EQ(test.code, status.code()); EXPECT_EQ(message, status.message()); @@ -292,6 +296,10 @@ TEST(Status, ToStringMode) { AllOf(HasSubstr("INTERNAL: fail"), HasSubstr("[foo='bar']"), HasSubstr("[bar='\\xff']"))); + EXPECT_THAT(s.ToString(absl::StatusToStringMode::kWithEverything), + AllOf(HasSubstr("INTERNAL: fail"), HasSubstr("[foo='bar']"), + HasSubstr("[bar='\\xff']"))); + EXPECT_THAT(s.ToString(~absl::StatusToStringMode::kWithPayload), AllOf(HasSubstr("INTERNAL: fail"), Not(HasSubstr("[foo='bar']")), Not(HasSubstr("[bar='\\xff']")))); diff --git a/third_party/abseil-cpp/absl/status/statusor.h b/third_party/abseil-cpp/absl/status/statusor.h index 469d486fdd..b7c55cc8ac 100644 --- a/third_party/abseil-cpp/absl/status/statusor.h +++ b/third_party/abseil-cpp/absl/status/statusor.h @@ -135,7 +135,7 @@ class ABSL_MUST_USE_RESULT StatusOr; // // NOTE: using `absl::StatusOr<T>::value()` when no valid value is present will // throw an exception if exceptions are enabled or terminate the process when -// execeptions are not enabled. +// exceptions are not enabled. 
// // Example: // diff --git a/third_party/abseil-cpp/absl/strings/CMakeLists.txt b/third_party/abseil-cpp/absl/strings/CMakeLists.txt index 3b7ae639f5..0246dc3851 100644 --- a/third_party/abseil-cpp/absl/strings/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/strings/CMakeLists.txt @@ -101,7 +101,7 @@ absl_cc_test( DEPS absl::strings absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -115,7 +115,7 @@ absl_cc_test( absl::strings absl::core_headers absl::fixed_array - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -128,7 +128,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -142,7 +142,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -156,7 +156,7 @@ absl_cc_test( absl::strings_internal absl::base absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -169,7 +169,7 @@ absl_cc_test( DEPS absl::strings absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -184,7 +184,7 @@ absl_cc_test( absl::config absl::core_headers absl::dynamic_annotations - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -197,7 +197,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -209,7 +209,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -221,12 +221,12 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings - absl::base absl::core_headers absl::dynamic_annotations + absl::btree absl::flat_hash_map absl::node_hash_map - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -238,7 +238,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -253,7 +253,7 @@ absl_cc_test( absl::base absl::core_headers absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -268,7 +268,7 @@ absl_cc_test( absl::base absl::core_headers absl::memory - gmock_main 
+ GTest::gmock_main ) absl_cc_test( @@ -281,7 +281,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -301,7 +301,7 @@ absl_cc_test( absl::random_random absl::random_distributions absl::strings_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -314,7 +314,7 @@ absl_cc_test( DEPS absl::strings absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -326,7 +326,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -340,7 +340,7 @@ absl_cc_test( absl::strings absl::str_format absl::pow10_helper - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -355,7 +355,7 @@ absl_cc_test( absl::strings absl::config absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -370,7 +370,7 @@ absl_cc_test( DEPS absl::strings absl::config - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -428,7 +428,7 @@ absl_cc_test( absl::cord absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -442,7 +442,7 @@ absl_cc_test( absl::str_format absl::str_format_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -455,7 +455,7 @@ absl_cc_test( DEPS absl::str_format absl::str_format_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -467,7 +467,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::str_format_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -479,7 +479,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::str_format - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -494,7 +494,7 @@ absl_cc_test( absl::str_format_internal absl::raw_logging_internal absl::int128 - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -507,7 +507,7 @@ absl_cc_test( DEPS absl::str_format_internal absl::cord - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -520,7 +520,7 @@ absl_cc_test( DEPS absl::str_format_internal absl::core_headers - gmock_main + GTest::gmock_main ) 
absl_cc_library( @@ -547,39 +547,300 @@ absl_cc_test( DEPS absl::pow10_helper absl::str_format - gmock_main + GTest::gmock_main ) absl_cc_library( NAME - cord + cord_internal HDRS - "cord.h" - SRCS - "cord.cc" - "internal/cord_internal.cc" "internal/cord_internal.h" + "internal/cord_rep_flat.h" "internal/cord_rep_ring.h" - "internal/cord_rep_ring.cc" "internal/cord_rep_ring_reader.h" - "internal/cord_rep_flat.h" + SRCS + "internal/cord_internal.cc" + "internal/cord_rep_ring.cc" COPTS ${ABSL_DEFAULT_COPTS} DEPS - absl::base absl::base_internal absl::compressed_tuple absl::config absl::core_headers absl::endian + absl::inlined_vector + absl::layout + absl::raw_logging_internal + absl::strings + absl::throw_delegate + absl::type_traits +) + +absl_cc_library( + NAME + cordz_update_tracker + HDRS + "internal/cordz_update_tracker.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config +) + +absl_cc_test( + NAME + cordz_update_tracker_test + SRCS + "internal/cordz_update_tracker_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cordz_update_tracker + absl::core_headers + absl::synchronization + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_functions + HDRS + "internal/cordz_functions.h" + SRCS + "internal/cordz_functions.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::core_headers + absl::exponential_biased + absl::raw_logging_internal +) + +absl_cc_test( + NAME + cordz_functions_test + SRCS + "internal/cordz_functions_test.cc" + DEPS + absl::config + absl::cordz_functions + absl::cordz_test_helpers + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_statistics + HDRS + "internal/cordz_statistics.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::core_headers + absl::cordz_update_tracker + absl::synchronization +) + +absl_cc_library( + NAME + cordz_handle + HDRS + "internal/cordz_handle.h" + SRCS + "internal/cordz_handle.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::base + absl::config + 
absl::raw_logging_internal + absl::synchronization +) + +absl_cc_test( + NAME + cordz_handle_test + SRCS + "internal/cordz_handle_test.cc" + DEPS + absl::config + absl::cordz_handle + absl::cordz_test_helpers + absl::memory + absl::random_random + absl::random_distributions + absl::synchronization + absl::time + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_info + HDRS + "internal/cordz_info.h" + SRCS + "internal/cordz_info.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::base + absl::config + absl::cord_internal + absl::cordz_functions + absl::cordz_handle + absl::cordz_statistics + absl::cordz_update_tracker + absl::core_headers + absl::inlined_vector + absl::span + absl::raw_logging_internal + absl::stacktrace + absl::synchronization +) + +absl_cc_test( + NAME + cordz_info_test + SRCS + "internal/cordz_info_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_test_helpers + absl::cordz_handle + absl::cordz_info + absl::cordz_statistics + absl::cordz_test_helpers + absl::cordz_update_tracker + absl::span + absl::stacktrace + absl::symbolize + GTest::gmock_main +) + +absl_cc_test( + NAME + cordz_info_statistics_test + SRCS + "internal/cordz_info_statistics_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord + absl::cord_internal + absl::cordz_info + absl::cordz_sample_token + absl::cordz_statistics + absl::cordz_update_scope + absl::cordz_update_tracker + absl::thread_pool + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_sample_token + HDRS + "internal/cordz_sample_token.h" + SRCS + "internal/cordz_sample_token.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::cordz_handle + absl::cordz_info +) + +absl_cc_test( + NAME + cordz_sample_token_test + SRCS + "internal/cordz_sample_token_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_handle + absl::cordz_info + absl::cordz_info + absl::cordz_sample_token + 
absl::cordz_test_helpers + absl::memory + absl::random_random + absl::synchronization + absl::thread_pool + absl::time + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_update_scope + HDRS + "internal/cordz_update_scope.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_info + absl::cordz_update_tracker + absl::core_headers +) + +absl_cc_test( + NAME + cordz_update_scope_test + SRCS + "internal/cordz_update_scope_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_info + absl::cordz_test_helpers + absl::cordz_update_scope + absl::cordz_update_tracker + absl::core_headers + GTest::gmock_main +) + +absl_cc_library( + NAME + cord + HDRS + "cord.h" + SRCS + "cord.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::base + absl::config + absl::cord_internal + absl::cordz_functions + absl::cordz_info + absl::cordz_update_scope + absl::cordz_update_tracker + absl::core_headers + absl::endian absl::fixed_array absl::function_ref absl::inlined_vector absl::optional absl::raw_logging_internal absl::strings - absl::strings_internal - absl::throw_delegate absl::type_traits PUBLIC ) @@ -592,7 +853,30 @@ absl_cc_library( COPTS ${ABSL_TEST_COPTS} DEPS + absl::config + absl::cord + absl::cord_internal + absl::strings + TESTONLY +) + +absl_cc_library( + NAME + cordz_test_helpers + HDRS + "cordz_test_helpers.h" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config absl::cord + absl::cord_internal + absl::cordz_info + absl::cordz_sample_token + absl::cordz_statistics + absl::cordz_update_tracker + absl::core_headers + absl::strings TESTONLY ) @@ -609,11 +893,13 @@ absl_cc_test( absl::strings absl::base absl::config + absl::cord_test_helpers + absl::cordz_test_helpers absl::core_headers absl::endian absl::raw_logging_internal absl::fixed_array - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -624,13 +910,13 @@ absl_cc_test( COPTS ${ABSL_TEST_COPTS} DEPS - absl::config - absl::cord - 
absl::strings absl::base + absl::config + absl::cord_internal absl::core_headers absl::raw_logging_internal - gmock_main + absl::strings + GTest::gmock_main ) absl_cc_test( @@ -641,9 +927,33 @@ absl_cc_test( COPTS ${ABSL_TEST_COPTS} DEPS - absl::cord + absl::base + absl::cord_internal + absl::core_headers absl::strings + GTest::gmock_main +) + +absl_cc_test( + NAME + cordz_test + SRCS + "cordz_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::cord + absl::cord_test_helpers + absl::cordz_test_helpers + absl::cordz_functions + absl::cordz_info + absl::cordz_sample_token + absl::cordz_statistics + absl::cordz_update_tracker absl::base + absl::config absl::core_headers - gmock_main + absl::raw_logging_internal + absl::strings + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/strings/charconv.cc b/third_party/abseil-cpp/absl/strings/charconv.cc index b8674c2802..fefcfc90a5 100644 --- a/third_party/abseil-cpp/absl/strings/charconv.cc +++ b/third_party/abseil-cpp/absl/strings/charconv.cc @@ -111,7 +111,7 @@ struct FloatTraits<double> { return sign ? -ldexp(mantissa, exponent) : ldexp(mantissa, exponent); #else constexpr uint64_t kMantissaMask = - (uint64_t(1) << (kTargetMantissaBits - 1)) - 1; + (uint64_t{1} << (kTargetMantissaBits - 1)) - 1; uint64_t dbl = static_cast<uint64_t>(sign) << 63; if (mantissa > kMantissaMask) { // Normal value. @@ -151,7 +151,7 @@ struct FloatTraits<float> { return sign ? -ldexpf(mantissa, exponent) : ldexpf(mantissa, exponent); #else constexpr uint32_t kMantissaMask = - (uint32_t(1) << (kTargetMantissaBits - 1)) - 1; + (uint32_t{1} << (kTargetMantissaBits - 1)) - 1; uint32_t flt = static_cast<uint32_t>(sign) << 31; if (mantissa > kMantissaMask) { // Normal value. 
@@ -499,7 +499,7 @@ bool MustRoundUp(uint64_t guess_mantissa, int guess_exponent, template <typename FloatType> CalculatedFloat CalculatedFloatFromRawValues(uint64_t mantissa, int exponent) { CalculatedFloat result; - if (mantissa == uint64_t(1) << FloatTraits<FloatType>::kTargetMantissaBits) { + if (mantissa == uint64_t{1} << FloatTraits<FloatType>::kTargetMantissaBits) { mantissa >>= 1; exponent += 1; } diff --git a/third_party/abseil-cpp/absl/strings/cord.cc b/third_party/abseil-cpp/absl/strings/cord.cc index 93533757f5..f5aa6e4788 100644 --- a/third_party/abseil-cpp/absl/strings/cord.cc +++ b/third_party/abseil-cpp/absl/strings/cord.cc @@ -38,6 +38,9 @@ #include "absl/strings/internal/cord_internal.h" #include "absl/strings/internal/cord_rep_flat.h" #include "absl/strings/internal/cord_rep_ring.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_scope.h" +#include "absl/strings/internal/cordz_update_tracker.h" #include "absl/strings/internal/resize_uninitialized.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" @@ -53,8 +56,10 @@ using ::absl::cord_internal::CordRepExternal; using ::absl::cord_internal::CordRepFlat; using ::absl::cord_internal::CordRepRing; using ::absl::cord_internal::CordRepSubstring; -using ::absl::cord_internal::kMinFlatLength; +using ::absl::cord_internal::CordzUpdateTracker; +using ::absl::cord_internal::InlineData; using ::absl::cord_internal::kMaxFlatLength; +using ::absl::cord_internal::kMinFlatLength; using ::absl::cord_internal::CONCAT; using ::absl::cord_internal::EXTERNAL; @@ -206,7 +211,7 @@ static CordRep* MakeBalancedTree(CordRep** reps, size_t n) { } static CordRepFlat* CreateFlat(const char* data, size_t length, - size_t alloc_hint) { + size_t alloc_hint) { CordRepFlat* flat = CordRepFlat::New(length + alloc_hint); flat->length = length; memcpy(flat->Data(), data, length); @@ -230,9 +235,7 @@ static CordRep* RingNewTree(const char* data, size_t length, 
// Create a new tree out of the specified array. // The returned node has a refcount of 1. -static CordRep* NewTree(const char* data, - size_t length, - size_t alloc_hint) { +static CordRep* NewTree(const char* data, size_t length, size_t alloc_hint) { if (length == 0) return nullptr; if (cord_ring_enabled()) { return RingNewTree(data, length, alloc_hint); @@ -279,6 +282,35 @@ static CordRep* NewSubstring(CordRep* child, size_t offset, size_t length) { } } +// Creates a CordRep from the provided string. If the string is large enough, +// and not wasteful, we move the string into an external cord rep, preserving +// the already allocated string contents. +// Requires the provided string length to be larger than `kMaxInline`. +static CordRep* CordRepFromString(std::string&& src) { + assert(src.length() > cord_internal::kMaxInline); + if ( + // String is short: copy data to avoid external block overhead. + src.size() <= kMaxBytesToCopy || + // String is wasteful: copy data to avoid pinning too much unused memory. + src.size() < src.capacity() / 2 + ) { + return NewTree(src.data(), src.size(), 0); + } + + struct StringReleaser { + void operator()(absl::string_view /* data */) {} + std::string data; + }; + const absl::string_view original_data = src; + auto* rep = + static_cast<::absl::cord_internal::CordRepExternalImpl<StringReleaser>*>( + absl::cord_internal::NewExternalRep(original_data, + StringReleaser{std::move(src)})); + // Moving src may have invalidated its data pointer, so adjust it. 
+ rep->base = rep->template get<0>().data.data(); + return rep; +} + // -------------------------------------------------------------------- // Cord::InlineRep functions @@ -299,20 +331,6 @@ inline char* Cord::InlineRep::set_data(size_t n) { return data_.as_chars(); } -inline CordRep* Cord::InlineRep::force_tree(size_t extra_hint) { - if (data_.is_tree()) { - return data_.as_tree(); - } - - size_t len = inline_size(); - CordRepFlat* result = CordRepFlat::New(len + extra_hint); - result->length = len; - static_assert(kMinFlatLength >= sizeof(data_), ""); - memcpy(result->Data(), data_.as_chars(), sizeof(data_)); - set_tree(result); - return result; -} - inline void Cord::InlineRep::reduce_size(size_t n) { size_t tag = inline_size(); assert(tag <= kMaxInline); @@ -334,25 +352,72 @@ static CordRepRing* ForceRing(CordRep* rep, size_t extra) { return (rep->tag == RING) ? rep->ring() : CordRepRing::Create(rep, extra); } -void Cord::InlineRep::AppendTree(CordRep* tree) { +void Cord::InlineRep::AppendTreeToInlined(CordRep* tree, + MethodIdentifier method) { + assert(!is_tree()); + if (!data_.is_empty()) { + CordRepFlat* flat = MakeFlatWithExtraCapacity(0); + if (cord_ring_enabled()) { + tree = CordRepRing::Append(CordRepRing::Create(flat, 1), tree); + } else { + tree = Concat(flat, tree); + } + } + EmplaceTree(tree, method); +} + +void Cord::InlineRep::AppendTreeToTree(CordRep* tree, MethodIdentifier method) { + assert(is_tree()); + const CordzUpdateScope scope(data_.cordz_info(), method); + if (cord_ring_enabled()) { + tree = CordRepRing::Append(ForceRing(data_.as_tree(), 1), tree); + } else { + tree = Concat(data_.as_tree(), tree); + } + SetTree(tree, scope); +} + +void Cord::InlineRep::AppendTree(CordRep* tree, MethodIdentifier method) { if (tree == nullptr) return; - if (data_.is_empty()) { - set_tree(tree); - } else if (cord_ring_enabled()) { - set_tree(CordRepRing::Append(ForceRing(force_tree(0), 1), tree)); + if (data_.is_tree()) { + AppendTreeToTree(tree, method); 
} else { - set_tree(Concat(force_tree(0), tree)); + AppendTreeToInlined(tree, method); } } -void Cord::InlineRep::PrependTree(CordRep* tree) { +void Cord::InlineRep::PrependTreeToInlined(CordRep* tree, + MethodIdentifier method) { + assert(!is_tree()); + if (!data_.is_empty()) { + CordRepFlat* flat = MakeFlatWithExtraCapacity(0); + if (cord_ring_enabled()) { + tree = CordRepRing::Prepend(CordRepRing::Create(flat, 1), tree); + } else { + tree = Concat(tree, flat); + } + } + EmplaceTree(tree, method); +} + +void Cord::InlineRep::PrependTreeToTree(CordRep* tree, + MethodIdentifier method) { + assert(is_tree()); + const CordzUpdateScope scope(data_.cordz_info(), method); + if (cord_ring_enabled()) { + tree = CordRepRing::Prepend(ForceRing(data_.as_tree(), 1), tree); + } else { + tree = Concat(tree, data_.as_tree()); + } + SetTree(tree, scope); +} + +void Cord::InlineRep::PrependTree(CordRep* tree, MethodIdentifier method) { assert(tree != nullptr); - if (data_.is_empty()) { - set_tree(tree); - } else if (cord_ring_enabled()) { - set_tree(CordRepRing::Prepend(ForceRing(force_tree(0), 1), tree)); + if (data_.is_tree()) { + PrependTreeToTree(tree, method); } else { - set_tree(Concat(tree, force_tree(0))); + PrependTreeToInlined(tree, method); } } @@ -404,76 +469,43 @@ static inline bool PrepareAppendRegion(CordRep* root, char** region, return true; } +template <bool has_length> void Cord::InlineRep::GetAppendRegion(char** region, size_t* size, - size_t max_length) { - if (max_length == 0) { - *region = nullptr; - *size = 0; - return; - } - - // Try to fit in the inline buffer if possible. - if (!is_tree()) { - size_t inline_length = inline_size(); - if (max_length <= kMaxInline - inline_length) { - *region = data_.as_chars() + inline_length; - *size = max_length; - set_inline_size(inline_length + max_length); - return; - } - } - - CordRep* root = force_tree(max_length); - - if (PrepareAppendRegion(root, region, size, max_length)) { - return; - } - - // Allocate new node. 
- CordRepFlat* new_node = - CordRepFlat::New(std::max(static_cast<size_t>(root->length), max_length)); - new_node->length = std::min(new_node->Capacity(), max_length); - *region = new_node->Data(); - *size = new_node->length; - - if (cord_ring_enabled()) { - replace_tree(CordRepRing::Append(ForceRing(root, 1), new_node)); - return; - } - replace_tree(Concat(root, new_node)); -} - -void Cord::InlineRep::GetAppendRegion(char** region, size_t* size) { - const size_t max_length = std::numeric_limits<size_t>::max(); - - // Try to fit in the inline buffer if possible. - if (!data_.is_tree()) { - size_t inline_length = inline_size(); - if (inline_length < kMaxInline) { - *region = data_.as_chars() + inline_length; - *size = kMaxInline - inline_length; - set_inline_size(kMaxInline); + size_t length) { + auto constexpr method = CordzUpdateTracker::kGetAppendRegion; + + CordRep* root = tree(); + size_t sz = root ? root->length : inline_size(); + if (root == nullptr) { + size_t available = kMaxInline - sz; + if (available >= (has_length ? length : 1)) { + *region = data_.as_chars() + sz; + *size = has_length ? length : available; + set_inline_size(has_length ? sz + length : kMaxInline); return; } } - CordRep* root = force_tree(max_length); - - if (PrepareAppendRegion(root, region, size, max_length)) { + size_t extra = has_length ? length : (std::max)(sz, kMinFlatLength); + CordRep* rep = root ? root : MakeFlatWithExtraCapacity(extra); + CordzUpdateScope scope(root ? data_.cordz_info() : nullptr, method); + if (PrepareAppendRegion(rep, region, size, length)) { + CommitTree(root, rep, scope, method); return; } // Allocate new node. 
- CordRepFlat* new_node = CordRepFlat::New(root->length); - new_node->length = new_node->Capacity(); + CordRepFlat* new_node = CordRepFlat::New(extra); + new_node->length = std::min(new_node->Capacity(), length); *region = new_node->Data(); *size = new_node->length; if (cord_ring_enabled()) { - replace_tree(CordRepRing::Append(ForceRing(root, 1), new_node)); - return; + rep = CordRepRing::Append(ForceRing(rep, 1), new_node); + } else { + rep = Concat(rep, new_node); } - replace_tree(Concat(root, new_node)); + CommitTree(root, rep, scope, method); } // If the rep is a leaf, this will increment the value at total_mem_usage and @@ -484,68 +516,67 @@ static bool RepMemoryUsageLeaf(const CordRep* rep, size_t* total_mem_usage) { return true; } if (rep->tag == EXTERNAL) { - *total_mem_usage += sizeof(CordRepConcat) + rep->length; + // We don't know anything about the embedded / bound data, but we can safely + // assume it is 'at least' a word / pointer to data. In the future we may + // choose to use the 'data' byte as a tag to identify the types of some + // well-known externals, such as a std::string instance. + *total_mem_usage += + sizeof(cord_internal::CordRepExternalImpl<intptr_t>) + rep->length; return true; } return false; } void Cord::InlineRep::AssignSlow(const Cord::InlineRep& src) { - ClearSlow(); + assert(&src != this); + assert(is_tree() || src.is_tree()); + auto constexpr method = CordzUpdateTracker::kAssignCord; + if (ABSL_PREDICT_TRUE(!is_tree())) { + EmplaceTree(CordRep::Ref(src.as_tree()), src.data_, method); + return; + } - data_ = src.data_; - if (is_tree()) { - data_.set_profiled(false); - CordRep::Ref(tree()); - clear_cordz_info(); + CordRep* tree = as_tree(); + if (CordRep* src_tree = src.tree()) { + // Leave any existing `cordz_info` in place, and let MaybeTrackCord() + // decide if this cord should be (or remains to be) sampled or not. 
+ data_.set_tree(CordRep::Ref(src_tree)); + CordzInfo::MaybeTrackCord(data_, src.data_, method); + } else { + CordzInfo::MaybeUntrackCord(data_.cordz_info()); + data_ = src.data_; } + CordRep::Unref(tree); } -void Cord::InlineRep::ClearSlow() { +void Cord::InlineRep::UnrefTree() { if (is_tree()) { + CordzInfo::MaybeUntrackCord(data_.cordz_info()); CordRep::Unref(tree()); } - ResetToEmpty(); } // -------------------------------------------------------------------- // Constructors and destructors -Cord::Cord(absl::string_view src) { +Cord::Cord(absl::string_view src, MethodIdentifier method) + : contents_(InlineData::kDefaultInit) { const size_t n = src.size(); if (n <= InlineRep::kMaxInline) { - contents_.set_data(src.data(), n, false); + contents_.set_data(src.data(), n, true); } else { - contents_.set_tree(NewTree(src.data(), n, 0)); + CordRep* rep = NewTree(src.data(), n, 0); + contents_.EmplaceTree(rep, method); } } template <typename T, Cord::EnableIfString<T>> -Cord::Cord(T&& src) { - if ( - // String is short: copy data to avoid external block overhead. - src.size() <= kMaxBytesToCopy || - // String is wasteful: copy data to avoid pinning too much unused memory. - src.size() < src.capacity() / 2 - ) { - if (src.size() <= InlineRep::kMaxInline) { - contents_.set_data(src.data(), src.size(), false); - } else { - contents_.set_tree(NewTree(src.data(), src.size(), 0)); - } +Cord::Cord(T&& src) : contents_(InlineData::kDefaultInit) { + if (src.size() <= InlineRep::kMaxInline) { + contents_.set_data(src.data(), src.size(), true); } else { - struct StringReleaser { - void operator()(absl::string_view /* data */) {} - std::string data; - }; - const absl::string_view original_data = src; - auto* rep = static_cast< - ::absl::cord_internal::CordRepExternalImpl<StringReleaser>*>( - absl::cord_internal::NewExternalRep( - original_data, StringReleaser{std::forward<T>(src)})); - // Moving src may have invalidated its data pointer, so adjust it. 
- rep->base = rep->template get<0>().data.data(); - contents_.set_tree(rep); + CordRep* rep = CordRepFromString(std::forward<T>(src)); + contents_.EmplaceTree(rep, CordzUpdateTracker::kConstructorString); } } @@ -554,9 +585,9 @@ template Cord::Cord(std::string&& src); // The destruction code is separate so that the compiler can determine // that it does not need to call the destructor on a moved-from Cord. void Cord::DestroyCordSlow() { - if (CordRep* tree = contents_.tree()) { - CordRep::Unref(VerifyTree(tree)); - } + assert(contents_.is_tree()); + CordzInfo::MaybeUntrackCord(contents_.cordz_info()); + CordRep::Unref(VerifyTree(contents_.as_tree())); } // -------------------------------------------------------------------- @@ -568,109 +599,117 @@ void Cord::Clear() { } } -Cord& Cord::operator=(absl::string_view src) { +Cord& Cord::AssignLargeString(std::string&& src) { + auto constexpr method = CordzUpdateTracker::kAssignString; + assert(src.size() > kMaxBytesToCopy); + CordRep* rep = CordRepFromString(std::move(src)); + if (CordRep* tree = contents_.tree()) { + CordzUpdateScope scope(contents_.cordz_info(), method); + contents_.SetTree(rep, scope); + CordRep::Unref(tree); + } else { + contents_.EmplaceTree(rep, method); + } + return *this; +} +Cord& Cord::operator=(absl::string_view src) { + auto constexpr method = CordzUpdateTracker::kAssignString; const char* data = src.data(); size_t length = src.size(); CordRep* tree = contents_.tree(); if (length <= InlineRep::kMaxInline) { - // Embed into this->contents_ + // Embed into this->contents_, which is somewhat subtle: + // - MaybeUntrackCord must be called before Unref(tree). + // - MaybeUntrackCord must be called before set_data() clobbers cordz_info. + // - set_data() must be called before Unref(tree) as it may reference tree. 
+ if (tree != nullptr) CordzInfo::MaybeUntrackCord(contents_.cordz_info()); contents_.set_data(data, length, true); - if (tree) CordRep::Unref(tree); - return *this; - } - if (tree != nullptr && tree->tag >= FLAT && - tree->flat()->Capacity() >= length && - tree->refcount.IsOne()) { - // Copy in place if the existing FLAT node is reusable. - memmove(tree->flat()->Data(), data, length); - tree->length = length; - VerifyTree(tree); + if (tree != nullptr) CordRep::Unref(tree); return *this; } - contents_.set_tree(NewTree(data, length, 0)); - if (tree) CordRep::Unref(tree); - return *this; -} - -template <typename T, Cord::EnableIfString<T>> -Cord& Cord::operator=(T&& src) { - if (src.size() <= kMaxBytesToCopy) { - *this = absl::string_view(src); + if (tree != nullptr) { + CordzUpdateScope scope(contents_.cordz_info(), method); + if (tree->tag >= FLAT && tree->flat()->Capacity() >= length && + tree->refcount.IsOne()) { + // Copy in place if the existing FLAT node is reusable. + memmove(tree->flat()->Data(), data, length); + tree->length = length; + VerifyTree(tree); + return *this; + } + contents_.SetTree(NewTree(data, length, 0), scope); + CordRep::Unref(tree); } else { - *this = Cord(std::forward<T>(src)); + contents_.EmplaceTree(NewTree(data, length, 0), method); } return *this; } -template Cord& Cord::operator=(std::string&& src); - // TODO(sanjay): Move to Cord::InlineRep section of file. For now, // we keep it here to make diffs easier. -void Cord::InlineRep::AppendArray(const char* src_data, size_t src_size) { - if (src_size == 0) return; // memcpy(_, nullptr, 0) is undefined. +void Cord::InlineRep::AppendArray(absl::string_view src, + MethodIdentifier method) { + if (src.empty()) return; // memcpy(_, nullptr, 0) is undefined. size_t appended = 0; - CordRep* root = nullptr; - if (is_tree()) { - root = data_.as_tree(); + CordRep* rep = tree(); + const CordRep* const root = rep; + CordzUpdateScope scope(root ? 
cordz_info() : nullptr, method); + if (root != nullptr) { char* region; - if (PrepareAppendRegion(root, ®ion, &appended, src_size)) { - memcpy(region, src_data, appended); + if (PrepareAppendRegion(rep, ®ion, &appended, src.size())) { + memcpy(region, src.data(), appended); } } else { // Try to fit in the inline buffer if possible. size_t inline_length = inline_size(); - if (src_size <= kMaxInline - inline_length) { + if (src.size() <= kMaxInline - inline_length) { // Append new data to embedded array - memcpy(data_.as_chars() + inline_length, src_data, src_size); - set_inline_size(inline_length + src_size); + memcpy(data_.as_chars() + inline_length, src.data(), src.size()); + set_inline_size(inline_length + src.size()); return; } - // It is possible that src_data == data_, but when we transition from an - // InlineRep to a tree we need to assign data_ = root via set_tree. To - // avoid corrupting the source data before we copy it, delay calling - // set_tree until after we've copied data. + // Note: we don't concern ourselves if src aliases data stored in the + // inlined data of 'this', as we update the InlineData only at the end. // We are going from an inline size to beyond inline size. Make the new size // either double the inlined size, or the added size + 10%. 
- const size_t size1 = inline_length * 2 + src_size; - const size_t size2 = inline_length + src_size / 10; - root = CordRepFlat::New(std::max<size_t>(size1, size2)); - appended = std::min( - src_size, root->flat()->Capacity() - inline_length); - memcpy(root->flat()->Data(), data_.as_chars(), inline_length); - memcpy(root->flat()->Data() + inline_length, src_data, appended); - root->length = inline_length + appended; - set_tree(root); - } - - src_data += appended; - src_size -= appended; - if (src_size == 0) { + const size_t size1 = inline_length * 2 + src.size(); + const size_t size2 = inline_length + src.size() / 10; + rep = CordRepFlat::New(std::max<size_t>(size1, size2)); + appended = std::min(src.size(), rep->flat()->Capacity() - inline_length); + memcpy(rep->flat()->Data(), data_.as_chars(), inline_length); + memcpy(rep->flat()->Data() + inline_length, src.data(), appended); + rep->length = inline_length + appended; + } + + src.remove_prefix(appended); + if (src.empty()) { + CommitTree(root, rep, scope, method); return; } if (cord_ring_enabled()) { - absl::string_view data(src_data, src_size); - root = ForceRing(root, (data.size() - 1) / kMaxFlatLength + 1); - replace_tree(CordRepRing::Append(root->ring(), data)); - return; - } - - // Use new block(s) for any remaining bytes that were not handled above. - // Alloc extra memory only if the right child of the root of the new tree is - // going to be a FLAT node, which will permit further inplace appends. - size_t length = src_size; - if (src_size < kMaxFlatLength) { - // The new length is either - // - old size + 10% - // - old_size + src_size - // This will cause a reasonable conservative step-up in size that is still - // large enough to avoid excessive amounts of small fragments being added. 
- length = std::max<size_t>(root->length / 10, src_size); + rep = ForceRing(rep, (src.size() - 1) / kMaxFlatLength + 1); + rep = CordRepRing::Append(rep->ring(), src); + } else { + // Use new block(s) for any remaining bytes that were not handled above. + // Alloc extra memory only if the right child of the root of the new tree + // is going to be a FLAT node, which will permit further inplace appends. + size_t length = src.size(); + if (src.size() < kMaxFlatLength) { + // The new length is either + // - old size + 10% + // - old_size + src.size() + // This will cause a reasonable conservative step-up in size that is + // still large enough to avoid excessive amounts of small fragments + // being added. + length = std::max<size_t>(rep->length / 10, src.size()); + } + rep = Concat(rep, NewTree(src.data(), src.size(), length - src.size())); } - set_tree(Concat(root, NewTree(src_data, src_size, length - src_size))); + CommitTree(root, rep, scope, method); } inline CordRep* Cord::TakeRep() const& { @@ -685,10 +724,17 @@ inline CordRep* Cord::TakeRep() && { template <typename C> inline void Cord::AppendImpl(C&& src) { + auto constexpr method = CordzUpdateTracker::kAppendCord; if (empty()) { - // In case of an empty destination avoid allocating a new node, do not copy - // data. - *this = std::forward<C>(src); + // Since destination is empty, we can avoid allocating a node, + if (src.contents_.is_tree()) { + // by taking the tree directly + CordRep* rep = std::forward<C>(src).TakeRep(); + contents_.EmplaceTree(rep, method); + } else { + // or copying over inline data + contents_.data_ = src.contents_.data_; + } return; } @@ -698,12 +744,12 @@ inline void Cord::AppendImpl(C&& src) { CordRep* src_tree = src.contents_.tree(); if (src_tree == nullptr) { // src has embedded data. - contents_.AppendArray(src.contents_.data(), src_size); + contents_.AppendArray({src.contents_.data(), src_size}, method); return; } if (src_tree->tag >= FLAT) { // src tree just has one flat node. 
- contents_.AppendArray(src_tree->flat()->Data(), src_size); + contents_.AppendArray({src_tree->flat()->Data(), src_size}, method); return; } if (&src == this) { @@ -719,7 +765,8 @@ inline void Cord::AppendImpl(C&& src) { } // Guaranteed to be a tree (kMaxBytesToCopy > kInlinedSize) - contents_.AppendTree(std::forward<C>(src).TakeRep()); + CordRep* rep = std::forward<C>(src).TakeRep(); + contents_.AppendTree(rep, CordzUpdateTracker::kAppendCord); } void Cord::Append(const Cord& src) { AppendImpl(src); } @@ -731,7 +778,8 @@ void Cord::Append(T&& src) { if (src.size() <= kMaxBytesToCopy) { Append(absl::string_view(src)); } else { - Append(Cord(std::forward<T>(src))); + CordRep* rep = CordRepFromString(std::forward<T>(src)); + contents_.AppendTree(rep, CordzUpdateTracker::kAppendString); } } @@ -741,7 +789,7 @@ void Cord::Prepend(const Cord& src) { CordRep* src_tree = src.contents_.tree(); if (src_tree != nullptr) { CordRep::Ref(src_tree); - contents_.PrependTree(src_tree); + contents_.PrependTree(src_tree, CordzUpdateTracker::kPrependCord); return; } @@ -764,7 +812,8 @@ void Cord::Prepend(absl::string_view src) { return; } } - contents_.PrependTree(NewTree(src.data(), src.size(), 0)); + CordRep* rep = NewTree(src.data(), src.size(), 0); + contents_.PrependTree(rep, CordzUpdateTracker::kPrependString); } template <typename T, Cord::EnableIfString<T>> @@ -772,7 +821,8 @@ inline void Cord::Prepend(T&& src) { if (src.size() <= kMaxBytesToCopy) { Prepend(absl::string_view(src)); } else { - Prepend(Cord(std::forward<T>(src))); + CordRep* rep = CordRepFromString(std::forward<T>(src)); + contents_.PrependTree(rep, CordzUpdateTracker::kPrependString); } } @@ -870,12 +920,17 @@ void Cord::RemovePrefix(size_t n) { CordRep* tree = contents_.tree(); if (tree == nullptr) { contents_.remove_prefix(n); - } else if (tree->tag == RING) { - contents_.replace_tree(CordRepRing::RemovePrefix(tree->ring(), n)); } else { - CordRep* newrep = RemovePrefixFrom(tree, n); - CordRep::Unref(tree); 
- contents_.replace_tree(VerifyTree(newrep)); + auto constexpr method = CordzUpdateTracker::kRemovePrefix; + CordzUpdateScope scope(contents_.cordz_info(), method); + if (tree->tag == RING) { + tree = CordRepRing::RemovePrefix(tree->ring(), n); + } else { + CordRep* newrep = RemovePrefixFrom(tree, n); + CordRep::Unref(tree); + tree = VerifyTree(newrep); + } + contents_.SetTreeOrEmpty(tree, scope); } } @@ -886,12 +941,17 @@ void Cord::RemoveSuffix(size_t n) { CordRep* tree = contents_.tree(); if (tree == nullptr) { contents_.reduce_size(n); - } else if (tree->tag == RING) { - contents_.replace_tree(CordRepRing::RemoveSuffix(tree->ring(), n)); } else { - CordRep* newrep = RemoveSuffixFrom(tree, n); - CordRep::Unref(tree); - contents_.replace_tree(VerifyTree(newrep)); + auto constexpr method = CordzUpdateTracker::kRemoveSuffix; + CordzUpdateScope scope(contents_.cordz_info(), method); + if (tree->tag == RING) { + tree = CordRepRing::RemoveSuffix(tree->ring(), n); + } else { + CordRep* newrep = RemoveSuffixFrom(tree, n); + CordRep::Unref(tree); + tree = VerifyTree(newrep); + } + contents_.SetTreeOrEmpty(tree, scope); } } @@ -951,17 +1011,20 @@ Cord Cord::Subcord(size_t pos, size_t new_size) const { size_t length = size(); if (pos > length) pos = length; if (new_size > length - pos) new_size = length - pos; + if (new_size == 0) return sub_cord; + CordRep* tree = contents_.tree(); if (tree == nullptr) { // sub_cord is newly constructed, no need to re-zero-out the tail of // contents_ memory. sub_cord.contents_.set_data(contents_.data() + pos, new_size, false); - } else if (new_size == 0) { - // We want to return empty subcord, so nothing to do. 
- } else if (new_size <= InlineRep::kMaxInline) { + return sub_cord; + } + + if (new_size <= InlineRep::kMaxInline) { + char* dest = sub_cord.contents_.data_.as_chars(); Cord::ChunkIterator it = chunk_begin(); it.AdvanceBytes(pos); - char* dest = sub_cord.contents_.data_.as_chars(); size_t remaining_size = new_size; while (remaining_size > it->size()) { cord_internal::SmallMemmove(dest, it->data(), it->size()); @@ -971,12 +1034,17 @@ Cord Cord::Subcord(size_t pos, size_t new_size) const { } cord_internal::SmallMemmove(dest, it->data(), remaining_size); sub_cord.contents_.set_inline_size(new_size); - } else if (tree->tag == RING) { - tree = CordRepRing::SubRing(CordRep::Ref(tree)->ring(), pos, new_size); - sub_cord.contents_.set_tree(tree); + return sub_cord; + } + + if (tree->tag == RING) { + CordRepRing* ring = CordRep::Ref(tree)->ring(); + tree = CordRepRing::SubRing(ring, pos, new_size); } else { - sub_cord.contents_.set_tree(NewSubRange(tree, pos, new_size)); + tree = NewSubRange(tree, pos, new_size); } + sub_cord.contents_.EmplaceTree(tree, contents_.data_, + CordzUpdateTracker::kSubCord); return sub_cord; } @@ -1418,6 +1486,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { ABSL_HARDENING_ASSERT(bytes_remaining_ >= n && "Attempted to iterate past `end()`"); Cord subcord; + auto constexpr method = CordzUpdateTracker::kCordReader; if (n <= InlineRep::kMaxInline) { // Range to read fits in inline data. Flatten it. 
@@ -1440,11 +1509,12 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { if (ring_reader_) { size_t chunk_size = current_chunk_.size(); if (n <= chunk_size && n <= kMaxBytesToCopy) { - subcord = Cord(current_chunk_.substr(0, n)); + subcord = Cord(current_chunk_.substr(0, n), method); } else { auto* ring = CordRep::Ref(ring_reader_.ring())->ring(); size_t offset = ring_reader_.length() - bytes_remaining_; - subcord.contents_.set_tree(CordRepRing::SubRing(ring, offset, n)); + CordRep* rep = CordRepRing::SubRing(ring, offset, n); + subcord.contents_.EmplaceTree(rep, method); } if (n < chunk_size) { bytes_remaining_ -= n; @@ -1463,7 +1533,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { const char* data = subnode->tag == EXTERNAL ? subnode->external()->base : subnode->flat()->Data(); subnode = NewSubstring(subnode, current_chunk_.data() - data, n); - subcord.contents_.set_tree(VerifyTree(subnode)); + subcord.contents_.EmplaceTree(VerifyTree(subnode), method); RemoveChunkPrefix(n); return subcord; } @@ -1506,7 +1576,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { if (node == nullptr) { // We have reached the end of the Cord. 
assert(bytes_remaining_ == 0); - subcord.contents_.set_tree(VerifyTree(subnode)); + subcord.contents_.EmplaceTree(VerifyTree(subnode), method); return subcord; } @@ -1546,7 +1616,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { current_chunk_ = absl::string_view(data + offset + n, length - n); current_leaf_ = node; bytes_remaining_ -= n; - subcord.contents_.set_tree(VerifyTree(subnode)); + subcord.contents_.EmplaceTree(VerifyTree(subnode), method); return subcord; } @@ -1653,6 +1723,7 @@ char Cord::operator[](size_t i) const { } absl::string_view Cord::FlattenSlowPath() { + assert(contents_.is_tree()); size_t total_size = size(); CordRep* new_rep; char* new_buffer; @@ -1673,10 +1744,9 @@ absl::string_view Cord::FlattenSlowPath() { s.size()); }); } - if (CordRep* tree = contents_.tree()) { - CordRep::Unref(tree); - } - contents_.set_tree(new_rep); + CordzUpdateScope scope(contents_.cordz_info(), CordzUpdateTracker::kFlatten); + CordRep::Unref(contents_.as_tree()); + contents_.SetTree(new_rep, scope); return absl::string_view(new_buffer, total_size); } @@ -1688,6 +1758,8 @@ absl::string_view Cord::FlattenSlowPath() { } else if (rep->tag == EXTERNAL) { *fragment = absl::string_view(rep->external()->base, rep->length); return true; + } else if (rep->tag == RING) { + return rep->ring()->IsFlat(fragment); } else if (rep->tag == SUBSTRING) { CordRep* child = rep->substring()->child; if (child->tag >= FLAT) { @@ -1698,6 +1770,9 @@ absl::string_view Cord::FlattenSlowPath() { *fragment = absl::string_view( child->external()->base + rep->substring()->start, rep->length); return true; + } else if (child->tag == RING) { + return child->ring()->IsFlat(rep->substring()->start, rep->length, + fragment); } } return false; @@ -1786,8 +1861,7 @@ static void DumpNode(CordRep* rep, bool include_data, std::ostream* os, *os << absl::CEscape(std::string(rep->external()->base, rep->length)); *os << "]\n"; } else if (rep->tag >= FLAT) { - *os << "FLAT cap=" << 
rep->flat()->Capacity() - << " ["; + *os << "FLAT cap=" << rep->flat()->Capacity() << " ["; if (include_data) *os << absl::CEscape(std::string(rep->flat()->Data(), rep->length)); *os << "]\n"; @@ -1799,7 +1873,7 @@ static void DumpNode(CordRep* rep, bool include_data, std::ostream* os, do { DumpNode(ring->entry_child(head), include_data, os, indent + kIndentStep); - head = ring->advance(head);; + head = ring->advance(head); } while (head != ring->tail()); } if (stack.empty()) break; @@ -1845,9 +1919,8 @@ static bool VerifyNode(CordRep* root, CordRep* start_node, worklist.push_back(node->concat()->left); } } else if (node->tag >= FLAT) { - ABSL_INTERNAL_CHECK( - node->length <= node->flat()->Capacity(), - ReportError(root, node)); + ABSL_INTERNAL_CHECK(node->length <= node->flat()->Capacity(), + ReportError(root, node)); } else if (node->tag == EXTERNAL) { ABSL_INTERNAL_CHECK(node->external()->base != nullptr, ReportError(root, node)); diff --git a/third_party/abseil-cpp/absl/strings/cord.h b/third_party/abseil-cpp/absl/strings/cord.h index fa9cb913fd..e758f1cdfb 100644 --- a/third_party/abseil-cpp/absl/strings/cord.h +++ b/third_party/abseil-cpp/absl/strings/cord.h @@ -70,6 +70,7 @@ #include <string> #include <type_traits> +#include "absl/base/config.h" #include "absl/base/internal/endian.h" #include "absl/base/internal/per_thread_tls.h" #include "absl/base/macros.h" @@ -80,6 +81,11 @@ #include "absl/strings/internal/cord_internal.h" #include "absl/strings/internal/cord_rep_ring.h" #include "absl/strings/internal/cord_rep_ring_reader.h" +#include "absl/strings/internal/cordz_functions.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_scope.h" +#include "absl/strings/internal/cordz_update_tracker.h" #include "absl/strings/internal/resize_uninitialized.h" #include "absl/strings/internal/string_constant.h" #include "absl/strings/string_view.h" @@ -664,10 +670,24 @@ class 
Cord { explicit constexpr Cord(strings_internal::StringConstant<T>); private: + using CordRep = absl::cord_internal::CordRep; + using CordRepFlat = absl::cord_internal::CordRepFlat; + using CordzInfo = cord_internal::CordzInfo; + using CordzUpdateScope = cord_internal::CordzUpdateScope; + using CordzUpdateTracker = cord_internal::CordzUpdateTracker; + using InlineData = cord_internal::InlineData; + using MethodIdentifier = CordzUpdateTracker::MethodIdentifier; + + // Creates a cord instance with `method` representing the originating + // public API call causing the cord to be created. + explicit Cord(absl::string_view src, MethodIdentifier method); + friend class CordTestPeer; friend bool operator==(const Cord& lhs, const Cord& rhs); friend bool operator==(const Cord& lhs, absl::string_view rhs); + friend const CordzInfo* GetCordzInfoForTesting(const Cord& cord); + // Calls the provided function once for each cord chunk, in order. Unlike // Chunks(), this API will not allocate memory. void ForEachChunk(absl::FunctionRef<void(absl::string_view)>) const; @@ -687,6 +707,7 @@ class Cord { static_assert(kMaxInline >= sizeof(absl::cord_internal::CordRep*), ""); constexpr InlineRep() : data_() {} + explicit InlineRep(InlineData::DefaultInitType init) : data_(init) {} InlineRep(const InlineRep& src); InlineRep(InlineRep&& src); InlineRep& operator=(const InlineRep& src); @@ -704,23 +725,56 @@ class Cord { // Returns nullptr if holding bytes absl::cord_internal::CordRep* tree() const; absl::cord_internal::CordRep* as_tree() const; - // Discards old pointer, if any - void set_tree(absl::cord_internal::CordRep* rep); - // Replaces a tree with a new root. This is faster than set_tree, but it - // should only be used when it's clear that the old rep was a tree. - void replace_tree(absl::cord_internal::CordRep* rep); // Returns non-null iff was holding a pointer absl::cord_internal::CordRep* clear(); // Converts to pointer if necessary. 
- absl::cord_internal::CordRep* force_tree(size_t extra_hint); void reduce_size(size_t n); // REQUIRES: holding data void remove_prefix(size_t n); // REQUIRES: holding data - void AppendArray(const char* src_data, size_t src_size); + void AppendArray(absl::string_view src, MethodIdentifier method); absl::string_view FindFlatStartPiece() const; - void AppendTree(absl::cord_internal::CordRep* tree); - void PrependTree(absl::cord_internal::CordRep* tree); - void GetAppendRegion(char** region, size_t* size, size_t max_length); - void GetAppendRegion(char** region, size_t* size); + + // Creates a CordRepFlat instance from the current inlined data with `extra' + // bytes of desired additional capacity. + CordRepFlat* MakeFlatWithExtraCapacity(size_t extra); + + // Sets the tree value for this instance. `rep` must not be null. + // Requires the current instance to hold a tree, and a lock to be held on + // any CordzInfo referenced by this instance. The latter is enforced through + // the CordzUpdateScope argument. If the current instance is sampled, then + // the CordzInfo instance is updated to reference the new `rep` value. + void SetTree(CordRep* rep, const CordzUpdateScope& scope); + + // Identical to SetTree(), except that `rep` is allowed to be null, in + // which case the current instance is reset to an empty value. + void SetTreeOrEmpty(CordRep* rep, const CordzUpdateScope& scope); + + // Sets the tree value for this instance, and randomly samples this cord. + // This function disregards existing contents in `data_`, and should be + // called when a Cord is 'promoted' from an 'uninitialized' or 'inlined' + // value to a non-inlined (tree / ring) value. + void EmplaceTree(CordRep* rep, MethodIdentifier method); + + // Identical to EmplaceTree, except that it copies the parent stack from + // the provided `parent` data if the parent is sampled. 
+ void EmplaceTree(CordRep* rep, const InlineData& parent, + MethodIdentifier method); + + // Commits the change of a newly created, or updated `rep` root value into + // this cord. `old_rep` indicates the old (inlined or tree) value of the + // cord, and determines if the commit invokes SetTree() or EmplaceTree(). + void CommitTree(const CordRep* old_rep, CordRep* rep, + const CordzUpdateScope& scope, MethodIdentifier method); + + void AppendTreeToInlined(CordRep* tree, MethodIdentifier method); + void AppendTreeToTree(CordRep* tree, MethodIdentifier method); + void AppendTree(CordRep* tree, MethodIdentifier method); + void PrependTreeToInlined(CordRep* tree, MethodIdentifier method); + void PrependTreeToTree(CordRep* tree, MethodIdentifier method); + void PrependTree(CordRep* tree, MethodIdentifier method); + + template <bool has_length> + void GetAppendRegion(char** region, size_t* size, size_t length); + bool IsSame(const InlineRep& other) const { return memcmp(&data_, &other.data_, sizeof(data_)) == 0; } @@ -776,8 +830,8 @@ class Cord { friend class Cord; void AssignSlow(const InlineRep& src); - // Unrefs the tree, stops profiling, and zeroes the contents - void ClearSlow(); + // Unrefs the tree and stops profiling. + void UnrefTree(); void ResetToEmpty() { data_ = {}; } @@ -828,6 +882,10 @@ class Cord { template <typename C> void AppendImpl(C&& src); + // Assigns the value in 'src' to this instance, 'stealing' its contents. + // Requires src.length() > kMaxBytesToCopy. + Cord& AssignLargeString(std::string&& src); + // Helper for AbslHashValue(). 
template <typename H> H HashFragmented(H hash_state) const { @@ -930,8 +988,11 @@ inline CordRep* NewExternalRep(absl::string_view data, template <typename Releaser> Cord MakeCordFromExternal(absl::string_view data, Releaser&& releaser) { Cord cord; - cord.contents_.set_tree(::absl::cord_internal::NewExternalRep( - data, std::forward<Releaser>(releaser))); + if (auto* rep = ::absl::cord_internal::NewExternalRep( + data, std::forward<Releaser>(releaser))) { + cord.contents_.EmplaceTree(rep, + Cord::MethodIdentifier::kMakeCordFromExternal); + } return cord; } @@ -939,15 +1000,16 @@ constexpr Cord::InlineRep::InlineRep(cord_internal::InlineData data) : data_(data) {} inline Cord::InlineRep::InlineRep(const Cord::InlineRep& src) - : data_(src.data_) { - if (is_tree()) { - data_.clear_cordz_info(); - absl::cord_internal::CordRep::Ref(as_tree()); + : data_(InlineData::kDefaultInit) { + if (CordRep* tree = src.tree()) { + EmplaceTree(CordRep::Ref(tree), src.data_, + CordzUpdateTracker::kConstructorCord); + } else { + data_ = src.data_; } } -inline Cord::InlineRep::InlineRep(Cord::InlineRep&& src) { - data_ = src.data_; +inline Cord::InlineRep::InlineRep(Cord::InlineRep&& src) : data_(src.data_) { src.ResetToEmpty(); } @@ -966,7 +1028,7 @@ inline Cord::InlineRep& Cord::InlineRep::operator=(const Cord::InlineRep& src) { inline Cord::InlineRep& Cord::InlineRep::operator=( Cord::InlineRep&& src) noexcept { if (is_tree()) { - ClearSlow(); + UnrefTree(); } data_ = src.data_; src.ResetToEmpty(); @@ -1003,31 +1065,62 @@ inline size_t Cord::InlineRep::size() const { return is_tree() ? 
as_tree()->length : inline_size(); } -inline void Cord::InlineRep::set_tree(absl::cord_internal::CordRep* rep) { - if (rep == nullptr) { - ResetToEmpty(); +inline cord_internal::CordRepFlat* Cord::InlineRep::MakeFlatWithExtraCapacity( + size_t extra) { + static_assert(cord_internal::kMinFlatLength >= sizeof(data_), ""); + size_t len = data_.inline_size(); + auto* result = CordRepFlat::New(len + extra); + result->length = len; + memcpy(result->Data(), data_.as_chars(), sizeof(data_)); + return result; +} + +inline void Cord::InlineRep::EmplaceTree(CordRep* rep, + MethodIdentifier method) { + assert(rep); + data_.make_tree(rep); + CordzInfo::MaybeTrackCord(data_, method); +} + +inline void Cord::InlineRep::EmplaceTree(CordRep* rep, const InlineData& parent, + MethodIdentifier method) { + data_.make_tree(rep); + CordzInfo::MaybeTrackCord(data_, parent, method); +} + +inline void Cord::InlineRep::SetTree(CordRep* rep, + const CordzUpdateScope& scope) { + assert(rep); + assert(data_.is_tree()); + data_.set_tree(rep); + scope.SetCordRep(rep); +} + +inline void Cord::InlineRep::SetTreeOrEmpty(CordRep* rep, + const CordzUpdateScope& scope) { + assert(data_.is_tree()); + if (rep) { + data_.set_tree(rep); } else { - if (data_.is_tree()) { - // `data_` already holds a 'tree' value and an optional cordz_info value. - // Replace the tree value only, leaving the cordz_info value unchanged. - data_.set_tree(rep); - } else { - // `data_` contains inlined data: initialize data_ to tree value `rep`. 
- data_.make_tree(rep); - } + data_ = {}; } + scope.SetCordRep(rep); } -inline void Cord::InlineRep::replace_tree(absl::cord_internal::CordRep* rep) { - ABSL_ASSERT(is_tree()); - if (ABSL_PREDICT_FALSE(rep == nullptr)) { - set_tree(rep); - return; +inline void Cord::InlineRep::CommitTree(const CordRep* old_rep, CordRep* rep, + const CordzUpdateScope& scope, + MethodIdentifier method) { + if (old_rep) { + SetTree(rep, scope); + } else { + EmplaceTree(rep, method); } - data_.set_tree(rep); } inline absl::cord_internal::CordRep* Cord::InlineRep::clear() { + if (is_tree()) { + CordzInfo::MaybeUntrackCord(cordz_info()); + } absl::cord_internal::CordRep* result = tree(); ResetToEmpty(); return result; @@ -1042,6 +1135,9 @@ inline void Cord::InlineRep::CopyToArray(char* dst) const { constexpr inline Cord::Cord() noexcept {} +inline Cord::Cord(absl::string_view src) + : Cord(src, CordzUpdateTracker::kConstructorString) {} + template <typename T> constexpr Cord::Cord(strings_internal::StringConstant<T>) : contents_(strings_internal::StringConstant<T>::value.size() <= @@ -1057,6 +1153,15 @@ inline Cord& Cord::operator=(const Cord& x) { return *this; } +template <typename T, Cord::EnableIfString<T>> +Cord& Cord::operator=(T&& src) { + if (src.size() <= cord_internal::kMaxBytesToCopy) { + return operator=(absl::string_view(src)); + } else { + return AssignLargeString(std::forward<T>(src)); + } +} + inline Cord::Cord(const Cord& src) : contents_(src.contents_) {} inline Cord::Cord(Cord&& src) noexcept : contents_(std::move(src.contents_)) {} @@ -1071,7 +1176,6 @@ inline Cord& Cord::operator=(Cord&& x) noexcept { } extern template Cord::Cord(std::string&& src); -extern template Cord& Cord::operator=(std::string&& src); inline size_t Cord::size() const { // Length is 1st field in str.rep_ @@ -1114,7 +1218,7 @@ inline absl::string_view Cord::Flatten() { } inline void Cord::Append(absl::string_view src) { - contents_.AppendArray(src.data(), src.size()); + contents_.AppendArray(src, 
CordzUpdateTracker::kAppendString); } extern template void Cord::Append(std::string&& src); diff --git a/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc b/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc index 585616f3c0..d9a9a76d1e 100644 --- a/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc +++ b/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc @@ -78,6 +78,7 @@ TEST(CordRingReaderTest, Reset) { EXPECT_TRUE(static_cast<bool>(reader)); EXPECT_THAT(reader.ring(), Eq(ring)); EXPECT_THAT(reader.index(), Eq(ring->head())); + EXPECT_THAT(reader.node(), Eq(ring->entry_child(ring->head()))); EXPECT_THAT(reader.length(), Eq(ring->length)); EXPECT_THAT(reader.consumed(), Eq(flats[0].length())); EXPECT_THAT(reader.remaining(), Eq(ring->length - reader.consumed())); @@ -99,11 +100,13 @@ TEST(CordRingReaderTest, Next) { size_t consumed = reader.consumed(); size_t remaining = reader.remaining(); for (int i = 1; i < flats.size(); ++i) { + CordRepRing::index_type index = ring->advance(head, i); consumed += flats[i].length(); remaining -= flats[i].length(); absl::string_view next = reader.Next(); ASSERT_THAT(next, Eq(flats[i])); - ASSERT_THAT(reader.index(), Eq(ring->advance(head, i))); + ASSERT_THAT(reader.index(), Eq(index)); + ASSERT_THAT(reader.node(), Eq(ring->entry_child(index))); ASSERT_THAT(reader.consumed(), Eq(consumed)); ASSERT_THAT(reader.remaining(), Eq(remaining)); } @@ -125,13 +128,15 @@ TEST(CordRingReaderTest, SeekForward) { size_t consumed = 0; size_t remaining = ring->length;; for (int i = 0; i < flats.size(); ++i) { + CordRepRing::index_type index = ring->advance(head, i); size_t offset = consumed; consumed += flats[i].length(); remaining -= flats[i].length(); for (int off = 0; off < flats[i].length(); ++off) { absl::string_view chunk = reader.Seek(offset + off); ASSERT_THAT(chunk, Eq(flats[i].substr(off))); - ASSERT_THAT(reader.index(), Eq(ring->advance(head, i))); + ASSERT_THAT(reader.index(), Eq(index)); + 
ASSERT_THAT(reader.node(), Eq(ring->entry_child(index))); ASSERT_THAT(reader.consumed(), Eq(consumed)); ASSERT_THAT(reader.remaining(), Eq(remaining)); } @@ -150,11 +155,13 @@ TEST(CordRingReaderTest, SeekBackward) { size_t consumed = ring->length; size_t remaining = 0; for (int i = flats.size() - 1; i >= 0; --i) { + CordRepRing::index_type index = ring->advance(head, i); size_t offset = consumed - flats[i].length(); for (int off = 0; off < flats[i].length(); ++off) { absl::string_view chunk = reader.Seek(offset + off); ASSERT_THAT(chunk, Eq(flats[i].substr(off))); - ASSERT_THAT(reader.index(), Eq(ring->advance(head, i))); + ASSERT_THAT(reader.index(), Eq(index)); + ASSERT_THAT(reader.node(), Eq(ring->entry_child(index))); ASSERT_THAT(reader.consumed(), Eq(consumed)); ASSERT_THAT(reader.remaining(), Eq(remaining)); } diff --git a/third_party/abseil-cpp/absl/strings/cord_ring_test.cc b/third_party/abseil-cpp/absl/strings/cord_ring_test.cc index 7d75e106e7..cc8fbaf995 100644 --- a/third_party/abseil-cpp/absl/strings/cord_ring_test.cc +++ b/third_party/abseil-cpp/absl/strings/cord_ring_test.cc @@ -31,9 +31,6 @@ extern thread_local bool cord_ring; -// TOOD(b/177688959): weird things happened with the original test -#define ASAN_BUG_177688959_FIXED false - namespace absl { ABSL_NAMESPACE_BEGIN namespace { @@ -101,15 +98,22 @@ using TestParams = std::vector<TestParam>; // Matcher validating when mutable copies are required / performed. MATCHER_P2(EqIfPrivate, param, rep, absl::StrCat("Equal 0x", absl::Hex(rep), " if private")) { - return param.refcount_is_one ? arg == rep : arg != rep; + return param.refcount_is_one ? arg == rep : true; } // Matcher validating when mutable copies are required / performed. MATCHER_P2(EqIfPrivateAndCapacity, param, rep, absl::StrCat("Equal 0x", absl::Hex(rep), " if private and capacity")) { - return (param.refcount_is_one && param.with_capacity) ? arg == rep - : arg != rep; + return (param.refcount_is_one && param.with_capacity) ? 
arg == rep : true; +} + +// Matcher validating a shared ring was re-allocated. Should only be used for +// tests doing exactly one update as subsequent updates could return the +// original (freed and re-used) pointer. +MATCHER_P2(NeIfShared, param, rep, + absl::StrCat("Not equal 0x", absl::Hex(rep), " if shared")) { + return param.refcount_is_one ? true : arg != rep; } MATCHER_P2(EqIfInputPrivate, param, rep, "Equal if input is private") { @@ -340,19 +344,15 @@ std::string TestParamToString(const testing::TestParamInfo<TestParam>& info) { class CordRingTest : public testing::Test { public: ~CordRingTest() override { -#if ASAN_BUG_177688959_FIXED for (CordRep* rep : unrefs_) { CordRep::Unref(rep); } -#endif } template <typename CordRepType> CordRepType* NeedsUnref(CordRepType* rep) { assert(rep); -#if ASAN_BUG_177688959_FIXED unrefs_.push_back(rep); -#endif return rep; } @@ -362,26 +362,16 @@ class CordRingTest : public testing::Test { return NeedsUnref(rep); } - void Unref(CordRep* rep) { -#if !ASAN_BUG_177688959_FIXED - CordRep::Unref(rep); -#endif - } - private: -#if ASAN_BUG_177688959_FIXED std::vector<CordRep*> unrefs_; -#endif }; class CordRingTestWithParam : public testing::TestWithParam<TestParam> { public: ~CordRingTestWithParam() override { -#if ASAN_BUG_177688959_FIXED for (CordRep* rep : unrefs_) { CordRep::Unref(rep); } -#endif } CordRepRing* CreateWithCapacity(CordRep* child, size_t extra_capacity) { @@ -400,9 +390,7 @@ class CordRingTestWithParam : public testing::TestWithParam<TestParam> { template <typename CordRepType> CordRepType* NeedsUnref(CordRepType* rep) { assert(rep); -#if ASAN_BUG_177688959_FIXED unrefs_.push_back(rep); -#endif return rep; } @@ -412,43 +400,23 @@ class CordRingTestWithParam : public testing::TestWithParam<TestParam> { return NeedsUnref(rep); } - void Unref(CordRep* rep) { -#if !ASAN_BUG_177688959_FIXED - CordRep::Unref(rep); -#endif - } - template <typename CordRepType> CordRepType* RefIfShared(CordRepType* rep) { return 
Shared() ? Ref(rep) : rep; } - void UnrefIfShared(CordRep* rep) { - if (Shared()) Unref(rep); - } - template <typename CordRepType> CordRepType* RefIfInputShared(CordRepType* rep) { return InputShared() ? Ref(rep) : rep; } - void UnrefIfInputShared(CordRep* rep) { - if (InputShared()) Unref(rep); - } - template <typename CordRepType> CordRepType* RefIfInputSharedIndirect(CordRepType* rep) { return InputSharedIndirect() ? Ref(rep) : rep; } - void UnrefIfInputSharedIndirect(CordRep* rep) { - if (InputSharedIndirect()) Unref(rep); - } - private: -#if ASAN_BUG_177688959_FIXED std::vector<CordRep*> unrefs_; -#endif }; class CordRingCreateTest : public CordRingTestWithParam { @@ -520,26 +488,26 @@ class CordRingBuildInputTest : public CordRingTestWithParam { } }; -INSTANTIATE_TEST_CASE_P(WithParam, CordRingSubTest, - testing::ValuesIn(CordRingSubTest::CreateTestParams()), - TestParamToString); +INSTANTIATE_TEST_SUITE_P(WithParam, CordRingSubTest, + testing::ValuesIn(CordRingSubTest::CreateTestParams()), + TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingCreateTest, testing::ValuesIn(CordRingCreateTest::CreateTestParams()), TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingCreateFromTreeTest, testing::ValuesIn(CordRingCreateFromTreeTest::CreateTestParams()), TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingBuildTest, testing::ValuesIn(CordRingBuildTest::CreateTestParams()), TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingBuildInputTest, testing::ValuesIn(CordRingBuildInputTest::CreateTestParams()), TestParamToString); @@ -550,7 +518,6 @@ TEST_P(CordRingCreateTest, CreateFromFlat) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1)); - Unref(result); } TEST_P(CordRingCreateTest, CreateFromRing) { @@ -558,9 +525,8 @@ 
TEST_P(CordRingCreateTest, CreateFromRing) { CordRepRing* result = NeedsUnref(CordRepRing::Create(ring)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAreArray(kFoxFlats)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringRing) { @@ -570,23 +536,20 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringRing) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfInputPrivate(GetParam(), ring)); EXPECT_THAT(ToString(result), string_view(kFox).substr(2, 11)); - UnrefIfInputSharedIndirect(ring); - UnrefIfInputShared(sub); - Unref(result); } TEST_F(CordRingTest, CreateWithIllegalExtraCapacity) { - CordRep* flat = NeedsUnref(MakeFlat("Hello world")); #if defined(ABSL_HAVE_EXCEPTIONS) + CordRep* flat = NeedsUnref(MakeFlat("Hello world")); try { CordRepRing::Create(flat, CordRepRing::kMaxCapacity); GTEST_FAIL() << "expected std::length_error exception"; } catch (const std::length_error&) { } #elif defined(GTEST_HAS_DEATH_TEST) + CordRep* flat = NeedsUnref(MakeFlat("Hello world")); EXPECT_DEATH(CordRepRing::Create(flat, CordRepRing::kMaxCapacity), ".*"); #endif - Unref(flat); } TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfFlat) { @@ -597,9 +560,6 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfFlat) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(20)); EXPECT_THAT(ToFlats(result), ElementsAre(str1.substr(4, 20))); - Unref(result); - UnrefIfInputShared(flat); - UnrefIfInputSharedIndirect(child); } TEST_P(CordRingCreateTest, CreateFromExternal) { @@ -609,8 +569,6 @@ TEST_P(CordRingCreateTest, CreateFromExternal) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1)); - Unref(result); - UnrefIfInputShared(child); } TEST_P(CordRingCreateFromTreeTest, 
CreateFromSubstringOfExternal) { @@ -621,9 +579,6 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfExternal) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(24)); EXPECT_THAT(ToFlats(result), ElementsAre(str1.substr(1, 24))); - Unref(result); - UnrefIfInputShared(external); - UnrefIfInputSharedIndirect(child); } TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfLargeExternal) { @@ -637,9 +592,6 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfLargeExternal) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str.size())); EXPECT_THAT(ToRawFlats(result), ElementsAre(str)); - Unref(result); - UnrefIfInputShared(external); - UnrefIfInputSharedIndirect(child); } TEST_P(CordRingBuildInputTest, CreateFromConcat) { @@ -652,10 +604,6 @@ TEST_P(CordRingBuildInputTest, CreateFromConcat) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(26)); EXPECT_THAT(ToString(result), Eq(kAlphabet)); - UnrefIfInputSharedIndirect(flats[0]); - UnrefIfInputSharedIndirect(flats[3]); - UnrefIfInputShared(concat); - Unref(result); } TEST_P(CordRingBuildInputTest, CreateFromSubstringConcat) { @@ -671,10 +619,6 @@ TEST_P(CordRingBuildInputTest, CreateFromSubstringConcat) { ASSERT_THAT(result, IsValidRingBuffer()); ASSERT_THAT(result->length, Eq(len)); ASSERT_THAT(ToString(result), string_view(kAlphabet).substr(off, len)); - UnrefIfInputSharedIndirect(flats[0]); - UnrefIfInputSharedIndirect(flats[3]); - UnrefIfInputShared(child); - Unref(result); } } } @@ -689,7 +633,6 @@ TEST_P(CordRingCreateTest, Properties) { EXPECT_THAT(result->capacity(), Le(2 * 120 + 1)); EXPECT_THAT(result->entries(), Eq(1)); EXPECT_THAT(result->begin_pos(), Eq(0)); - Unref(result); } TEST_P(CordRingCreateTest, EntryForNewFlat) { @@ -700,7 +643,6 @@ TEST_P(CordRingCreateTest, EntryForNewFlat) { EXPECT_THAT(result->entry_child(0), Eq(child)); EXPECT_THAT(result->entry_end_pos(0), Eq(str1.length())); 
EXPECT_THAT(result->entry_data_offset(0), Eq(0)); - Unref(result); } TEST_P(CordRingCreateTest, EntryForNewFlatSubstring) { @@ -712,7 +654,6 @@ TEST_P(CordRingCreateTest, EntryForNewFlatSubstring) { EXPECT_THAT(result->entry_child(0), Eq(child)); EXPECT_THAT(result->entry_end_pos(0), Eq(26)); EXPECT_THAT(result->entry_data_offset(0), Eq(10)); - Unref(result); } TEST_P(CordRingBuildTest, AppendFlat) { @@ -722,10 +663,9 @@ TEST_P(CordRingBuildTest, AppendFlat) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, MakeFlat(str2))); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1, str2)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependFlat) { @@ -735,10 +675,9 @@ TEST_P(CordRingBuildTest, PrependFlat) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, MakeFlat(str2))); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str2, str1)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendString) { @@ -748,10 +687,9 @@ TEST_P(CordRingBuildTest, AppendString) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1, str2)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendStringHavingExtra) { @@ -762,8 +700,7 @@ TEST_P(CordRingBuildTest, AppendStringHavingExtra) { ASSERT_THAT(result, 
IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); - UnrefIfShared(ring); - Unref(result); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); } TEST_P(CordRingBuildTest, AppendStringHavingPartialExtra) { @@ -785,13 +722,12 @@ TEST_P(CordRingBuildTest, AppendStringHavingPartialExtra) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); if (GetParam().refcount_is_one) { EXPECT_THAT(ToFlats(result), ElementsAre(StrCat(str1, str1a), str2a)); } else { EXPECT_THAT(ToFlats(result), ElementsAre(str1, str2)); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendStringHavingExtraInSubstring) { @@ -802,14 +738,13 @@ TEST_P(CordRingBuildTest, AppendStringHavingExtraInSubstring) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(4 + str2.size())); if (GetParam().refcount_is_one) { EXPECT_THAT(ToFlats(result), ElementsAre(StrCat("1234", str2))); } else { EXPECT_THAT(ToFlats(result), ElementsAre("1234", str2)); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendStringHavingSharedExtra) { @@ -837,10 +772,9 @@ TEST_P(CordRingBuildTest, AppendStringHavingSharedExtra) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(4 + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre("1234", str2)); - UnrefIfShared(ring); - Unref(result); CordRep::Unref(shared_type == 1 ? 
flat1 : flat); } @@ -857,8 +791,6 @@ TEST_P(CordRingBuildTest, AppendStringWithExtra) { EXPECT_THAT(result->length, Eq(str1.size() + str2.size() + str3.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre(str1, StrCat(str2, str3))); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependString) { @@ -875,8 +807,6 @@ TEST_P(CordRingBuildTest, PrependString) { } EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str2, str1)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependStringHavingExtra) { @@ -887,14 +817,13 @@ TEST_P(CordRingBuildTest, PrependStringHavingExtra) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(4 + str2.size())); if (GetParam().refcount_is_one) { EXPECT_THAT(ToFlats(result), ElementsAre(StrCat(str2, "1234"))); } else { EXPECT_THAT(ToFlats(result), ElementsAre(str2, "1234")); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependStringHavingSharedExtra) { @@ -920,9 +849,8 @@ TEST_P(CordRingBuildTest, PrependStringHavingSharedExtra) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1a.size() + str2.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre(str2, str1a)); - UnrefIfShared(ring); - Unref(result); CordRep::Unref(shared_type == 1 ? 
flat1 : flat); } } @@ -938,8 +866,6 @@ TEST_P(CordRingBuildTest, PrependStringWithExtra) { EXPECT_THAT(result->length, Eq(str1.size() + str2.size() + str3.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre(StrCat(str3, str2), str1)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendPrependStringMix) { @@ -950,12 +876,10 @@ TEST_P(CordRingBuildTest, AppendPrependStringMix) { result = CordRepRing::Prepend(result, flats[4 - i]); result = CordRepRing::Append(result, flats[4 + i]); } - UnrefIfShared(ring); NeedsUnref(result); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); EXPECT_THAT(ToString(result), kFox); - Unref(result); } TEST_P(CordRingBuildTest, AppendPrependStringMixWithExtra) { @@ -976,8 +900,6 @@ TEST_P(CordRingBuildTest, AppendPrependStringMixWithExtra) { EXPECT_THAT(ToFlats(result), ElementsAre("The quick brown fox ", "jumps ", "over the lazy dog")); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendPrependStringMixWithPrependedExtra) { @@ -998,8 +920,6 @@ TEST_P(CordRingBuildTest, AppendPrependStringMixWithPrependedExtra) { EXPECT_THAT(ToFlats(result), ElementsAre("The quick brown fox ", "jumps ", "over the lazy dog")); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingSubTest, SubRing) { @@ -1011,16 +931,14 @@ TEST_P(CordRingSubTest, SubRing) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::SubRing(ring, offset, 0); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = 1; len < all.size() - offset; ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::SubRing(ring, offset, len)); ASSERT_THAT(result, IsValidRingBuffer()); ASSERT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); ASSERT_THAT(ToString(result), 
Eq(all.substr(offset, len))); - UnrefIfShared(ring); - Unref(result); } } } @@ -1039,18 +957,16 @@ TEST_P(CordRingSubTest, SubRingFromLargeExternal) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::SubRing(ring, offset, 0); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = all.size() - 30; len < all.size() - offset; ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::SubRing(ring, offset, len)); ASSERT_THAT(result, IsValidRingBuffer()); ASSERT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); auto str = ToString(result); ASSERT_THAT(str, SizeIs(len)); ASSERT_THAT(str, Eq(all.substr(offset, len))); - UnrefIfShared(ring); - Unref(result); } } } @@ -1063,16 +979,14 @@ TEST_P(CordRingSubTest, RemovePrefix) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::RemovePrefix(ring, all.size()); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = 1; len < all.size(); ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::RemovePrefix(ring, len)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToString(result), Eq(all.substr(len))); - UnrefIfShared(ring); - Unref(result); } } @@ -1087,7 +1001,6 @@ TEST_P(CordRingSubTest, RemovePrefixFromLargeExternal) { ElementsAre( not_a_string_view(external1->base, 1 << 20).remove_prefix(1 << 16), not_a_string_view(external2->base, 1 << 20))); - Unref(result); } TEST_P(CordRingSubTest, RemoveSuffix) { @@ -1098,16 +1011,14 @@ TEST_P(CordRingSubTest, RemoveSuffix) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::RemoveSuffix(ring, all.size()); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = 1; len 
< all.size(); ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::RemoveSuffix(ring, len)); ASSERT_THAT(result, IsValidRingBuffer()); - EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); - EXPECT_THAT(ToString(result), Eq(all.substr(0, all.size() - len))); - UnrefIfShared(ring); - Unref(result); + ASSERT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); + ASSERT_THAT(ToString(result), Eq(all.substr(0, all.size() - len))); } } @@ -1120,9 +1031,8 @@ TEST_P(CordRingSubTest, AppendRing) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, child)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAreArray(kFoxFlats)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingWithFlatOffset) { @@ -1135,11 +1045,9 @@ TEST_P(CordRingBuildInputTest, AppendRingWithFlatOffset) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "brown ", "fox ", "jumps ", "over ", "the ", "lazy ", "dog")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingWithBrokenOffset) { @@ -1152,11 +1060,9 @@ TEST_P(CordRingBuildInputTest, AppendRingWithBrokenOffset) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "umps ", "over ", "the ", "lazy ", "dog")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - 
Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingWithFlatLength) { @@ -1169,11 +1075,9 @@ TEST_P(CordRingBuildInputTest, AppendRingWithFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "The ", "quick ", "brown ", "fox ", "jumps ", "over ", "the ")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendRingWithBrokenFlatLength) { @@ -1186,11 +1090,9 @@ TEST_P(CordRingBuildTest, AppendRingWithBrokenFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "The ", "quick ", "brown ", "fox ", "jumps ", "ov")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendRingMiddlePiece) { @@ -1203,11 +1105,9 @@ TEST_P(CordRingBuildTest, AppendRingMiddlePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "ck ", "brown ", "fox ", "jum")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendRingSinglePiece) { @@ -1220,11 +1120,8 @@ TEST_P(CordRingBuildTest, AppendRingSinglePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), 
ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "row")); - UnrefIfInputSharedIndirect(child); - UnrefIfInputShared(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingSinglePieceWithPrefix) { @@ -1241,11 +1138,8 @@ TEST_P(CordRingBuildInputTest, AppendRingSinglePieceWithPrefix) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Prepend", "Head", "row")); - UnrefIfInputSharedIndirect(child); - UnrefIfInputShared(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRing) { @@ -1258,10 +1152,8 @@ TEST_P(CordRingBuildInputTest, PrependRing) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, child)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAreArray(kFoxFlats)); - UnrefIfInputShared(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithFlatOffset) { @@ -1274,12 +1166,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithFlatOffset) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("brown ", "fox ", "jumps ", "over ", "the ", "lazy ", "dog", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithBrokenOffset) { @@ -1291,12 +1180,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithBrokenOffset) { CordRep* stripped = 
RefIfInputSharedIndirect(RemovePrefix(21, child)); CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("umps ", "over ", "the ", "lazy ", "dog", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithFlatLength) { @@ -1309,12 +1195,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("The ", "quick ", "brown ", "fox ", "jumps ", "over ", "the ", "Tail")); - UnrefIfShared(ring); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithBrokenFlatLength) { @@ -1327,12 +1210,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithBrokenFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("The ", "quick ", "brown ", "fox ", "jumps ", "ov", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingMiddlePiece) { @@ -1346,12 +1226,9 @@ TEST_P(CordRingBuildInputTest, PrependRingMiddlePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), 
ring)); EXPECT_THAT(ToFlats(result), ElementsAre("ck ", "brown ", "fox ", "jum", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingSinglePiece) { @@ -1364,11 +1241,8 @@ TEST_P(CordRingBuildInputTest, PrependRingSinglePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("row", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingSinglePieceWithPrefix) { @@ -1384,11 +1258,8 @@ TEST_P(CordRingBuildInputTest, PrependRingSinglePieceWithPrefix) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("row", "Prepend", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_F(CordRingTest, Find) { @@ -1406,7 +1277,6 @@ TEST_F(CordRingTest, Find) { ASSERT_THAT(found.offset, Lt(data.length())); ASSERT_THAT(data[found.offset], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindWithHint) { @@ -1442,7 +1312,6 @@ TEST_F(CordRingTest, FindWithHint) { ++flat_pos; flat_offset += flat.length(); } - Unref(ring); } TEST_F(CordRingTest, FindInLargeRing) { @@ -1464,7 +1333,6 @@ TEST_F(CordRingTest, FindInLargeRing) { ASSERT_THAT(pos.offset, Lt(data.length())); ASSERT_THAT(data[pos.offset], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindTail) { @@ -1483,7 +1351,6 @@ TEST_F(CordRingTest, FindTail) { ASSERT_THAT(pos.offset, Lt(data.length())); 
ASSERT_THAT(data[data.length() - pos.offset - 1], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindTailWithHint) { @@ -1510,7 +1377,6 @@ TEST_F(CordRingTest, FindTailWithHint) { ASSERT_THAT(pos.offset, Lt(data.length())); ASSERT_THAT(data[data.length() - pos.offset - 1], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindTailInLargeRing) { @@ -1532,7 +1398,6 @@ TEST_F(CordRingTest, FindTailInLargeRing) { ASSERT_THAT(pos.offset, Lt(data.length())); ASSERT_THAT(data[data.length() - pos.offset - 1], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, GetCharacter) { @@ -1544,7 +1409,6 @@ TEST_F(CordRingTest, GetCharacter) { for (int i = 0; i < value.length(); ++i) { ASSERT_THAT(result->GetCharacter(i), Eq(value[i])); } - Unref(result); } TEST_F(CordRingTest, GetCharacterWithSubstring) { @@ -1556,7 +1420,67 @@ TEST_F(CordRingTest, GetCharacterWithSubstring) { for (int i = 0; i < value.length(); ++i) { ASSERT_THAT(result->GetCharacter(i), Eq(value[i])); } - Unref(result); +} + +TEST_F(CordRingTest, IsFlatSingleFlat) { + for (bool external : {false, true}) { + SCOPED_TRACE(external ? "With External" : "With Flat"); + absl::string_view str = "Hello world"; + CordRep* rep = external ? MakeExternal(str) : MakeFlat(str); + CordRepRing* ring = NeedsUnref(CordRepRing::Create(rep)); + + // The ring is a single non-fragmented flat: + absl::string_view fragment; + EXPECT_TRUE(ring->IsFlat(nullptr)); + EXPECT_TRUE(ring->IsFlat(&fragment)); + EXPECT_THAT(fragment, Eq("Hello world")); + fragment = ""; + EXPECT_TRUE(ring->IsFlat(0, 11, nullptr)); + EXPECT_TRUE(ring->IsFlat(0, 11, &fragment)); + EXPECT_THAT(fragment, Eq("Hello world")); + + // Arbitrary ranges must check true as well. + EXPECT_TRUE(ring->IsFlat(1, 4, &fragment)); + EXPECT_THAT(fragment, Eq("ello")); + EXPECT_TRUE(ring->IsFlat(6, 5, &fragment)); + EXPECT_THAT(fragment, Eq("world")); + } +} + +TEST_F(CordRingTest, IsFlatMultiFlat) { + for (bool external : {false, true}) { + SCOPED_TRACE(external ? 
"With External" : "With Flat"); + absl::string_view str1 = "Hello world"; + absl::string_view str2 = "Halt and catch fire"; + CordRep* rep1 = external ? MakeExternal(str1) : MakeFlat(str1); + CordRep* rep2 = external ? MakeExternal(str2) : MakeFlat(str2); + CordRepRing* ring = CordRepRing::Append(CordRepRing::Create(rep1), rep2); + NeedsUnref(ring); + + // The ring is fragmented, IsFlat() on the entire cord must be false. + EXPECT_FALSE(ring->IsFlat(nullptr)); + absl::string_view fragment = "Don't touch this"; + EXPECT_FALSE(ring->IsFlat(&fragment)); + EXPECT_THAT(fragment, Eq("Don't touch this")); + + // Check for ranges exactly within both flats. + EXPECT_TRUE(ring->IsFlat(0, 11, &fragment)); + EXPECT_THAT(fragment, Eq("Hello world")); + EXPECT_TRUE(ring->IsFlat(11, 19, &fragment)); + EXPECT_THAT(fragment, Eq("Halt and catch fire")); + + // Check for arbitrary partial range inside each flat. + EXPECT_TRUE(ring->IsFlat(1, 4, &fragment)); + EXPECT_THAT(fragment, "ello"); + EXPECT_TRUE(ring->IsFlat(26, 4, &fragment)); + EXPECT_THAT(fragment, "fire"); + + // Check ranges spanning across both flats + fragment = "Don't touch this"; + EXPECT_FALSE(ring->IsFlat(1, 18, &fragment)); + EXPECT_FALSE(ring->IsFlat(10, 2, &fragment)); + EXPECT_THAT(fragment, Eq("Don't touch this")); + } } TEST_F(CordRingTest, Dump) { @@ -1564,7 +1488,6 @@ TEST_F(CordRingTest, Dump) { auto flats = MakeSpan(kFoxFlats); CordRepRing* ring = NeedsUnref(FromFlats(flats, kPrepend)); ss << *ring; - Unref(ring); } } // namespace diff --git a/third_party/abseil-cpp/absl/strings/cord_test.cc b/third_party/abseil-cpp/absl/strings/cord_test.cc index f9982428b3..14eca15573 100644 --- a/third_party/abseil-cpp/absl/strings/cord_test.cc +++ b/third_party/abseil-cpp/absl/strings/cord_test.cc @@ -35,6 +35,7 @@ #include "absl/base/macros.h" #include "absl/container/fixed_array.h" #include "absl/strings/cord_test_helpers.h" +#include "absl/strings/cordz_test_helpers.h" #include "absl/strings/str_cat.h" #include 
"absl/strings/str_format.h" #include "absl/strings/string_view.h" @@ -187,6 +188,19 @@ class CordTestPeer { static cord_internal::CordzInfo* GetCordzInfo(const Cord& c) { return c.contents_.cordz_info(); } + + static Cord MakeSubstring(Cord src, size_t offset, size_t length) { + ABSL_RAW_CHECK(src.contents_.is_tree(), "Can not be inlined"); + Cord cord; + auto* rep = new cord_internal::CordRepSubstring; + rep->tag = cord_internal::SUBSTRING; + rep->child = cord_internal::CordRep::Ref(src.contents_.tree()); + rep->start = offset; + rep->length = length; + cord.contents_.EmplaceTree(rep, + cord_internal::CordzUpdateTracker::kSubCord); + return cord; + } }; ABSL_NAMESPACE_END @@ -227,7 +241,6 @@ TEST(GigabyteCord, FromExternal) { // caused crashes in production. We grow exponentially so that the code will // execute in a reasonable amount of time. absl::Cord c; - ABSL_RAW_LOG(INFO, "Made a Cord with %zu bytes!", c.size()); c.Append(from); while (c.size() < max_size) { c.Append(c); @@ -466,8 +479,8 @@ TEST(TryFlat, SubstrInlined) { TEST(TryFlat, SubstrFlat) { absl::Cord c("longer than 15 bytes"); - c.RemovePrefix(1); - EXPECT_EQ(c.TryFlat(), "onger than 15 bytes"); + absl::Cord sub = absl::CordTestPeer::MakeSubstring(c, 1, c.size() - 1); + EXPECT_EQ(sub.TryFlat(), "onger than 15 bytes"); } TEST(TryFlat, Concat) { @@ -482,16 +495,46 @@ TEST(TryFlat, External) { TEST(TryFlat, SubstrExternal) { absl::Cord c = absl::MakeCordFromExternal("hell", [](absl::string_view) {}); - c.RemovePrefix(1); - EXPECT_EQ(c.TryFlat(), "ell"); + absl::Cord sub = absl::CordTestPeer::MakeSubstring(c, 1, c.size() - 1); + EXPECT_EQ(sub.TryFlat(), "ell"); } TEST(TryFlat, SubstrConcat) { absl::Cord c = absl::MakeFragmentedCord({"hello", " world"}); + absl::Cord sub = absl::CordTestPeer::MakeSubstring(c, 1, c.size() - 1); + EXPECT_EQ(sub.TryFlat(), absl::nullopt); c.RemovePrefix(1); EXPECT_EQ(c.TryFlat(), absl::nullopt); } +TEST(TryFlat, CommonlyAssumedInvariants) { + // The behavior tested below is 
not part of the API contract of Cord, but it's + // something we intend to be true in our current implementation. This test + // exists to detect and prevent accidental breakage of the implementation. + absl::string_view fragments[] = {"A fragmented test", + " cord", + " to test subcords", + " of ", + "a", + " cord for", + " each chunk " + "returned by the ", + "iterator"}; + absl::Cord c = absl::MakeFragmentedCord(fragments); + int fragment = 0; + int offset = 0; + absl::Cord::CharIterator itc = c.char_begin(); + for (absl::string_view sv : c.Chunks()) { + absl::string_view expected = fragments[fragment]; + absl::Cord subcord1 = c.Subcord(offset, sv.length()); + absl::Cord subcord2 = absl::Cord::AdvanceAndRead(&itc, sv.size()); + EXPECT_EQ(subcord1.TryFlat(), expected); + EXPECT_EQ(subcord2.TryFlat(), expected); + ++fragment; + offset += sv.length(); + } +} + static bool IsFlat(const absl::Cord& c) { return c.chunk_begin() == c.chunk_end() || ++c.chunk_begin() == c.chunk_end(); } @@ -1274,6 +1317,26 @@ TEST(Cord, Concat_Append) { EXPECT_EQ(s2.size(), size + 1); } +TEST(Cord, DiabolicalGrowth) { + // This test exercises a diabolical Append(<one char>) on a cord, making the + // cord shared before each Append call resulting in a terribly fragmented + // resulting cord. + // TODO(b/183983616): Apply some minimum compaction when copying a shared + // source cord into a mutable copy for updates in CordRepRing. 
+ RandomEngine rng(testing::GTEST_FLAG(random_seed)); + const std::string expected = RandomLowercaseString(&rng, 5000); + absl::Cord cord; + for (char c : expected) { + absl::Cord shared(cord); + cord.Append(absl::string_view(&c, 1)); + } + std::string value; + absl::CopyCordToString(cord, &value); + EXPECT_EQ(value, expected); + ABSL_RAW_LOG(INFO, "Diabolical size allocated = %zu", + cord.EstimatedMemoryUsage()); +} + TEST(MakeFragmentedCord, MakeFragmentedCordFromInitializerList) { absl::Cord fragmented = absl::MakeFragmentedCord({"A ", "fragmented ", "Cord"}); diff --git a/third_party/abseil-cpp/absl/strings/cord_test_helpers.h b/third_party/abseil-cpp/absl/strings/cord_test_helpers.h index f1036e3b13..31a1dc8980 100644 --- a/third_party/abseil-cpp/absl/strings/cord_test_helpers.h +++ b/third_party/abseil-cpp/absl/strings/cord_test_helpers.h @@ -17,11 +17,73 @@ #ifndef ABSL_STRINGS_CORD_TEST_HELPERS_H_ #define ABSL_STRINGS_CORD_TEST_HELPERS_H_ +#include <cstdint> +#include <iostream> +#include <string> + +#include "absl/base/config.h" #include "absl/strings/cord.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/string_view.h" namespace absl { ABSL_NAMESPACE_BEGIN +// Cord sizes relevant for testing +enum class TestCordSize { + // An empty value + kEmpty = 0, + + // An inlined string value + kInlined = cord_internal::kMaxInline / 2 + 1, + + // 'Well known' SSO lengths (excluding terminating zero). + // libstdcxx has a maximum SSO of 15, libc++ has a maximum SSO of 22. + kStringSso1 = 15, + kStringSso2 = 22, + + // A string value which is too large to fit in inlined data, but small enough + // such that Cord prefers copying the value if possible, i.e.: not stealing + // std::string inputs, or referencing existing CordReps on Append, etc. + kSmall = cord_internal::kMaxBytesToCopy / 2 + 1, + + // A string value large enough that Cord prefers to reference or steal from + // existing inputs rather than copying contents of the input. 
+ kMedium = cord_internal::kMaxFlatLength / 2 + 1, + + // A string value large enough to cause it to be stored in mutliple flats. + kLarge = cord_internal::kMaxFlatLength * 4 +}; + +// To string helper +inline absl::string_view ToString(TestCordSize size) { + switch (size) { + case TestCordSize::kEmpty: + return "Empty"; + case TestCordSize::kInlined: + return "Inlined"; + case TestCordSize::kSmall: + return "Small"; + case TestCordSize::kStringSso1: + return "StringSso1"; + case TestCordSize::kStringSso2: + return "StringSso2"; + case TestCordSize::kMedium: + return "Medium"; + case TestCordSize::kLarge: + return "Large"; + } + return "???"; +} + +// Returns the length matching the specified size +inline size_t Length(TestCordSize size) { return static_cast<size_t>(size); } + +// Stream output helper +inline std::ostream& operator<<(std::ostream& stream, TestCordSize size) { + return stream << ToString(size); +} + // Creates a multi-segment Cord from an iterable container of strings. The // resulting Cord is guaranteed to have one segment for every string in the // container. This allows code to be unit tested with multi-segment Cord diff --git a/third_party/abseil-cpp/absl/strings/cordz_test.cc b/third_party/abseil-cpp/absl/strings/cordz_test.cc new file mode 100644 index 0000000000..2b7d30b0e0 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/cordz_test.cc @@ -0,0 +1,466 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cstdint> +#include <string> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/base/macros.h" +#include "absl/strings/cord.h" +#include "absl/strings/cord_test_helpers.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cordz_functions.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_sample_token.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" + +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +using testing::Eq; +using testing::AnyOf; + +namespace absl { +ABSL_NAMESPACE_BEGIN + +using cord_internal::CordzInfo; +using cord_internal::CordzSampleToken; +using cord_internal::CordzStatistics; +using cord_internal::CordzUpdateTracker; +using Method = CordzUpdateTracker::MethodIdentifier; + +// Do not print cord contents, we only care about 'size' perhaps. +// Note that this method must be inside the named namespace. +inline void PrintTo(const Cord& cord, std::ostream* s) { + if (s) *s << "Cord[" << cord.size() << "]"; +} + +namespace { + +auto constexpr kMaxInline = cord_internal::kMaxInline; + +// Returns a string_view value of the specified length +// We do this to avoid 'consuming' large strings in Cord by default. +absl::string_view MakeString(size_t size) { + thread_local std::string str; + str = std::string(size, '.'); + return str; +} + +absl::string_view MakeString(TestCordSize size) { + return MakeString(Length(size)); +} + +// Returns a cord with a sampled method of kAppendString. 
+absl::Cord MakeAppendStringCord(TestCordSize size) { + CordzSamplingIntervalHelper always(1); + absl::Cord cord; + cord.Append(MakeString(size)); + return cord; +} + +std::string TestParamToString(::testing::TestParamInfo<TestCordSize> size) { + return absl::StrCat("On", ToString(size.param), "Cord"); +} + +class CordzUpdateTest : public testing::TestWithParam<TestCordSize> { + public: + Cord& cord() { return cord_; } + + Method InitialOr(Method method) const { + return (GetParam() > TestCordSize::kInlined) ? Method::kConstructorString + : method; + } + + private: + CordzSamplingIntervalHelper sample_every_{1}; + Cord cord_{MakeString(GetParam())}; +}; + +template <typename T> +std::string ParamToString(::testing::TestParamInfo<T> param) { + return std::string(ToString(param.param)); +} + +INSTANTIATE_TEST_SUITE_P(WithParam, CordzUpdateTest, + testing::Values(TestCordSize::kEmpty, + TestCordSize::kInlined, + TestCordSize::kLarge), + TestParamToString); + +class CordzStringTest : public testing::TestWithParam<TestCordSize> { + private: + CordzSamplingIntervalHelper sample_every_{1}; +}; + +INSTANTIATE_TEST_SUITE_P(WithParam, CordzStringTest, + testing::Values(TestCordSize::kInlined, + TestCordSize::kStringSso1, + TestCordSize::kStringSso2, + TestCordSize::kSmall, + TestCordSize::kLarge), + ParamToString<TestCordSize>); + +TEST(CordzTest, ConstructSmallArray) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(MakeString(TestCordSize::kSmall)); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST(CordzTest, ConstructLargeArray) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(MakeString(TestCordSize::kLarge)); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST_P(CordzStringTest, ConstructString) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(std::string(Length(GetParam()), '.')); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, 
HasValidCordzInfoOf(Method::kConstructorString)); + } +} + +TEST(CordzTest, CopyConstructFromUnsampled) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + Cord cord(src); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, CopyConstructFromSampled) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord(src); + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); +} + +TEST(CordzTest, MoveConstruct) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src(MakeString(TestCordSize::kLarge)); + Cord cord(std::move(src)); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST_P(CordzUpdateTest, AssignUnsampledCord) { + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + const CordzInfo* info = GetCordzInfoForTesting(cord()); + cord() = src; + EXPECT_THAT(GetCordzInfoForTesting(cord()), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST_P(CordzUpdateTest, AssignSampledCord) { + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + cord() = src; + ASSERT_THAT(cord(), HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord())->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignSampledCordToInlined) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord cord; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + cord = src; + ASSERT_THAT(cord, 
HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignSampledCordToUnsampledCord) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord cord = UnsampledCord(MakeString(TestCordSize::kLarge)); + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + cord = src; + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignUnsampledCordToSampledCordWithoutSampling) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord cord = MakeAppendStringCord(TestCordSize::kLarge); + const CordzInfo* info = GetCordzInfoForTesting(cord); + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord = src; + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST(CordzUpdateTest, AssignUnsampledCordToSampledCordWithSampling) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord = MakeAppendStringCord(TestCordSize::kLarge); + const CordzInfo* info = GetCordzInfoForTesting(cord); + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord = src; + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST(CordzUpdateTest, AssignSampledCordToSampledCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord(MakeString(TestCordSize::kLarge)); + cord = src; + 
ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignUnsampledCordToSampledCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord(MakeString(TestCordSize::kLarge)); + cord = src; + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzTest, AssignInlinedCordToSampledCord) { + CordzSampleToken token; + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(MakeString(TestCordSize::kLarge)); + const CordzInfo* info = GetCordzInfoForTesting(cord); + Cord src = UnsampledCord(MakeString(TestCordSize::kInlined)); + cord = src; + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST(CordzUpdateTest, MoveAssignCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord; + Cord src(MakeString(TestCordSize::kLarge)); + cord = std::move(src); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST_P(CordzUpdateTest, AssignLargeArray) { + cord() = MakeString(TestCordSize::kSmall); + EXPECT_THAT(cord(), HasValidCordzInfoOf(Method::kAssignString)); +} + +TEST_P(CordzUpdateTest, AssignSmallArray) { + cord() = MakeString(TestCordSize::kSmall); + EXPECT_THAT(cord(), HasValidCordzInfoOf(Method::kAssignString)); +} + +TEST_P(CordzUpdateTest, AssignInlinedArray) { + cord() = 
MakeString(TestCordSize::kInlined); + EXPECT_THAT(GetCordzInfoForTesting(cord()), Eq(nullptr)); +} + +TEST_P(CordzStringTest, AssignStringToInlined) { + Cord cord; + cord = std::string(Length(GetParam()), '.'); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kAssignString)); + } +} + +TEST_P(CordzStringTest, AssignStringToCord) { + Cord cord(MakeString(TestCordSize::kLarge)); + cord = std::string(Length(GetParam()), '.'); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kAssignString, 1)); + } +} + +TEST_P(CordzUpdateTest, AssignInlinedString) { + cord() = std::string(Length(TestCordSize::kInlined), '.'); + EXPECT_THAT(GetCordzInfoForTesting(cord()), Eq(nullptr)); +} + +TEST_P(CordzUpdateTest, AppendCord) { + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord().Append(src); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendCord))); +} + +TEST_P(CordzUpdateTest, MoveAppendCord) { + cord().Append(UnsampledCord(MakeString(TestCordSize::kLarge))); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendCord))); +} + +TEST_P(CordzUpdateTest, AppendSmallArray) { + cord().Append(MakeString(TestCordSize::kSmall)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendString))); +} + +TEST_P(CordzUpdateTest, AppendLargeArray) { + cord().Append(MakeString(TestCordSize::kLarge)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendString))); +} + +TEST_P(CordzStringTest, AppendStringToEmpty) { + Cord cord; + cord.Append(std::string(Length(GetParam()), '.')); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kAppendString)); + } +} + +TEST_P(CordzStringTest, AppendStringToInlined) { + Cord cord(MakeString(TestCordSize::kInlined)); + cord.Append(std::string(Length(GetParam()), '.')); + if (Length(TestCordSize::kInlined) + 
Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kAppendString)); + } +} + +TEST_P(CordzStringTest, AppendStringToCord) { + Cord cord(MakeString(TestCordSize::kLarge)); + cord.Append(std::string(Length(GetParam()), '.')); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kAppendString, 1)); +} + +TEST(CordzTest, MakeCordFromExternal) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord = MakeCordFromExternal("Hello world", [](absl::string_view) {}); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kMakeCordFromExternal)); +} + +TEST(CordzTest, MakeCordFromEmptyExternal) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord = MakeCordFromExternal({}, [](absl::string_view) {}); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST_P(CordzUpdateTest, PrependCord) { + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord().Prepend(src); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kPrependCord))); +} + +TEST_P(CordzUpdateTest, PrependSmallArray) { + cord().Prepend(MakeString(TestCordSize::kSmall)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kPrependString))); +} + +TEST_P(CordzUpdateTest, PrependLargeArray) { + cord().Prepend(MakeString(TestCordSize::kLarge)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kPrependString))); +} + +TEST_P(CordzStringTest, PrependStringToEmpty) { + Cord cord; + cord.Prepend(std::string(Length(GetParam()), '.')); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kPrependString)); + } +} + +TEST_P(CordzStringTest, PrependStringToInlined) { + Cord cord(MakeString(TestCordSize::kInlined)); + cord.Prepend(std::string(Length(GetParam()), '.')); + if (Length(TestCordSize::kInlined) + Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kPrependString)); + } +} + +TEST_P(CordzStringTest, 
PrependStringToCord) { + Cord cord(MakeString(TestCordSize::kLarge)); + cord.Prepend(std::string(Length(GetParam()), '.')); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kPrependString, 1)); +} + +TEST(CordzTest, RemovePrefix) { + CordzSamplingIntervalHelper sample_every(1); + Cord cord(MakeString(TestCordSize::kLarge)); + + // Half the cord + cord.RemovePrefix(cord.size() / 2); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemovePrefix, 1)); + + // TODO(mvels): RemovePrefix does not reset to inlined, except if empty? + cord.RemovePrefix(cord.size() - kMaxInline); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemovePrefix, 2)); + + cord.RemovePrefix(cord.size()); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, RemoveSuffix) { + CordzSamplingIntervalHelper sample_every(1); + Cord cord(MakeString(TestCordSize::kLarge)); + + // Half the cord + cord.RemoveSuffix(cord.size() / 2); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemoveSuffix, 1)); + + // TODO(mvels): RemoveSuffix does not reset to inlined, except if empty? 
+ cord.RemoveSuffix(cord.size() - kMaxInline); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemoveSuffix, 2)); + + cord.RemoveSuffix(cord.size()); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, SubCordFromUnsampledCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + Cord cord = src.Subcord(10, src.size() / 2); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, SubCordFromSampledCord) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord = src.Subcord(10, src.size() / 2); + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kSubCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); +} + +TEST(CordzTest, SmallSubCord) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord = src.Subcord(10, kMaxInline + 1); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kSubCord)); +} + +} // namespace + +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_INTERNAL_CORDZ_ENABLED diff --git a/third_party/abseil-cpp/absl/strings/cordz_test_helpers.h b/third_party/abseil-cpp/absl/strings/cordz_test_helpers.h new file mode 100644 index 0000000000..e410eecf7f --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/cordz_test_helpers.h @@ -0,0 +1,151 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_TEST_HELPERS_H_ +#define ABSL_STRINGS_CORDZ_TEST_HELPERS_H_ + +#include <utility> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/base/macros.h" +#include "absl/strings/cord.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_sample_token.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/strings/str_cat.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN + +// Returns the CordzInfo for the cord, or nullptr if the cord is not sampled. +inline const cord_internal::CordzInfo* GetCordzInfoForTesting( + const Cord& cord) { + if (!cord.contents_.is_tree()) return nullptr; + return cord.contents_.cordz_info(); +} + +// Returns true if the provided cordz_info is in the list of sampled cords. +inline bool CordzInfoIsListed(const cord_internal::CordzInfo* cordz_info, + cord_internal::CordzSampleToken token = {}) { + for (const cord_internal::CordzInfo& info : token) { + if (cordz_info == &info) return true; + } + return false; +} + +// Matcher on Cord that verifies all of: +// - the cord is sampled +// - the CordzInfo of the cord is listed / discoverable. 
+// - the reported CordzStatistics match the cord's actual properties +// - the cord has an (initial) UpdateTracker count of 1 for `method` +MATCHER_P(HasValidCordzInfoOf, method, "CordzInfo matches cord") { + const cord_internal::CordzInfo* cord_info = GetCordzInfoForTesting(arg); + if (cord_info == nullptr) { + *result_listener << "cord is not sampled"; + return false; + } + if (!CordzInfoIsListed(cord_info)) { + *result_listener << "cord is sampled, but not listed"; + return false; + } + cord_internal::CordzStatistics stat = cord_info->GetCordzStatistics(); + if (stat.size != arg.size()) { + *result_listener << "cordz size " << stat.size + << " does not match cord size " << arg.size(); + return false; + } + if (stat.update_tracker.Value(method) != 1) { + *result_listener << "Expected method count 1 for " << method << ", found " + << stat.update_tracker.Value(method); + return false; + } + return true; +} + +// Matcher on Cord that verifies that the cord is sampled and that the CordzInfo +// update tracker has 'method' with a call count of 'n' +MATCHER_P2(CordzMethodCountEq, method, n, + absl::StrCat("CordzInfo method count equals ", n)) { + const cord_internal::CordzInfo* cord_info = GetCordzInfoForTesting(arg); + if (cord_info == nullptr) { + *result_listener << "cord is not sampled"; + return false; + } + cord_internal::CordzStatistics stat = cord_info->GetCordzStatistics(); + if (stat.update_tracker.Value(method) != n) { + *result_listener << "Expected method count " << n << " for " << method + << ", found " << stat.update_tracker.Value(method); + return false; + } + return true; +} + +// Cordz will only update with a new rate once the previously scheduled event +// has fired. When we disable Cordz, a long delay takes place where we won't +// consider profiling new Cords. CordzSampleIntervalHelper will burn through +// that interval and allow for testing that assumes that the average sampling +// interval is a particular value. 
+class CordzSamplingIntervalHelper { + public: + explicit CordzSamplingIntervalHelper(int32_t interval) + : orig_mean_interval_(absl::cord_internal::get_cordz_mean_interval()) { + absl::cord_internal::set_cordz_mean_interval(interval); + absl::cord_internal::cordz_set_next_sample_for_testing(interval); + } + + ~CordzSamplingIntervalHelper() { + absl::cord_internal::set_cordz_mean_interval(orig_mean_interval_); + absl::cord_internal::cordz_set_next_sample_for_testing(orig_mean_interval_); + } + + private: + int32_t orig_mean_interval_; +}; + +// Wrapper struct managing a small CordRep `rep` +struct TestCordRep { + cord_internal::CordRepFlat* rep; + + TestCordRep() { + rep = cord_internal::CordRepFlat::New(100); + rep->length = 100; + memset(rep->Data(), 1, 100); + } + ~TestCordRep() { cord_internal::CordRep::Unref(rep); } +}; + +// Wrapper struct managing a small CordRep `rep`, and +// an InlineData `data` initialized with that CordRep. +struct TestCordData { + TestCordRep rep; + cord_internal::InlineData data{rep.rep}; +}; + +// Creates a Cord that is not sampled +template <typename... Args> +Cord UnsampledCord(Args... args) { + CordzSamplingIntervalHelper never(9999); + Cord cord(std::forward<Args>(args)...); + ABSL_ASSERT(GetCordzInfoForTesting(cord) == nullptr); + return cord; +} + +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_TEST_HELPERS_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc b/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc index 8b11868c88..d29acaf462 100644 --- a/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc +++ b/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc @@ -52,7 +52,7 @@ static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact"); // The lowest valued 19-digit decimal mantissa we can read still contains // sufficient information to reconstruct a binary mantissa. 
-static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above"); +static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above"); // ParseFloat<16> will read the first 15 significant digits of the mantissa. // diff --git a/third_party/abseil-cpp/absl/strings/internal/cord_internal.h b/third_party/abseil-cpp/absl/strings/internal/cord_internal.h index a1ba67fec3..813b3f3527 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_internal.h +++ b/third_party/abseil-cpp/absl/strings/internal/cord_internal.h @@ -329,18 +329,17 @@ static constexpr cordz_info_t BigEndianByte(unsigned char value) { class InlineData { public: + // DefaultInitType forces the use of the default initialization constructor. + enum DefaultInitType { kDefaultInit }; + // kNullCordzInfo holds the big endian representation of intptr_t(1) // This is the 'null' / initial value of 'cordz_info'. The null value // is specifically big endian 1 as with 64-bit pointers, the last // byte of cordz_info overlaps with the last byte holding the tag. static constexpr cordz_info_t kNullCordzInfo = BigEndianByte(1); - // kFakeCordzInfo holds a 'fake', non-null cordz-info value we use to - // emulate the previous 'kProfiled' tag logic in 'set_profiled' until - // cord code is changed to store cordz_info values in InlineData. - static constexpr cordz_info_t kFakeCordzInfo = BigEndianByte(9); - constexpr InlineData() : as_chars_{0} {} + explicit InlineData(DefaultInitType) {} explicit constexpr InlineData(CordRep* rep) : as_tree_(rep) {} explicit constexpr InlineData(absl::string_view chars) : as_chars_{ @@ -367,6 +366,16 @@ class InlineData { return as_tree_.cordz_info != kNullCordzInfo; } + // Returns true if either of the provided instances hold a cordz_info value. + // This method is more efficient than the equivalent `data1.is_profiled() || + // data2.is_profiled()`. Requires both arguments to hold a tree. 
+ static bool is_either_profiled(const InlineData& data1, + const InlineData& data2) { + assert(data1.is_tree() && data2.is_tree()); + return (data1.as_tree_.cordz_info | data2.as_tree_.cordz_info) != + kNullCordzInfo; + } + // Returns the cordz_info sampling instance for this instance, or nullptr // if the current instance is not sampled and does not have CordzInfo data. // Requires the current instance to hold a tree value. @@ -454,13 +463,6 @@ class InlineData { tag() = static_cast<char>(size << 1); } - // Sets or unsets the 'is_profiled' state of this instance. - // Requires the current instance to hold a tree value. - void set_profiled(bool profiled) { - assert(is_tree()); - as_tree_.cordz_info = profiled ? kFakeCordzInfo : kNullCordzInfo; - } - private: // See cordz_info_t for forced alignment and size of `cordz_info` details. struct AsTree { diff --git a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc index 4d31d1d97c..f78c94e19b 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc +++ b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc @@ -32,15 +32,6 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace cord_internal { -// See https://bugs.llvm.org/show_bug.cgi?id=48477 -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wshadow" -#if __has_warning("-Wshadow-field") -#pragma clang diagnostic ignored "-Wshadow-field" -#endif -#endif - namespace { using index_type = CordRepRing::index_type; @@ -301,7 +292,7 @@ bool CordRepRing::IsValid(std::ostream& output) const { if (offset >= child->length || entry_length > child->length - offset) { output << "entry[" << head << "] has offset " << offset << " and entry length " << entry_length - << " which are outside of the childs length of " << child->length; + << " which are outside of the child's length of " << child->length; return false; } @@ -400,10 +391,11 @@ CordRepRing* 
CordRepRing::Mutable(CordRepRing* rep, size_t extra) { // Get current number of entries, and check for max capacity. size_t entries = rep->entries(); - size_t min_extra = (std::max)(extra, rep->capacity() * 2 - entries); if (!rep->refcount.IsOne()) { - return Copy(rep, rep->head(), rep->tail(), min_extra); + return Copy(rep, rep->head(), rep->tail(), extra); } else if (entries + extra > rep->capacity()) { + const size_t min_grow = rep->capacity() + rep->capacity() / 2; + const size_t min_extra = (std::max)(extra, min_grow - entries); CordRepRing* newrep = CordRepRing::New(entries, min_extra); newrep->Fill<false>(rep, rep->head(), rep->tail()); CordRepRing::Delete(rep); @@ -449,12 +441,12 @@ Span<char> CordRepRing::GetPrependBuffer(size_t size) { } CordRepRing* CordRepRing::CreateFromLeaf(CordRep* child, size_t offset, - size_t length, size_t extra) { + size_t len, size_t extra) { CordRepRing* rep = CordRepRing::New(1, extra); rep->head_ = 0; rep->tail_ = rep->advance(0); - rep->length = length; - rep->entry_end_pos()[0] = length; + rep->length = len; + rep->entry_end_pos()[0] = len; rep->entry_child()[0] = child; rep->entry_data_offset()[0] = static_cast<offset_type>(offset); return Validate(rep); @@ -462,16 +454,16 @@ CordRepRing* CordRepRing::CreateFromLeaf(CordRep* child, size_t offset, CordRepRing* CordRepRing::CreateSlow(CordRep* child, size_t extra) { CordRepRing* rep = nullptr; - Consume(child, [&](CordRep* child, size_t offset, size_t length) { - if (IsFlatOrExternal(child)) { - rep = rep ? AppendLeaf(rep, child, offset, length) - : CreateFromLeaf(child, offset, length, extra); + Consume(child, [&](CordRep* child_arg, size_t offset, size_t len) { + if (IsFlatOrExternal(child_arg)) { + rep = rep ? 
AppendLeaf(rep, child_arg, offset, len) + : CreateFromLeaf(child_arg, offset, len, extra); } else if (rep) { - rep = AddRing<AddMode::kAppend>(rep, child->ring(), offset, length); - } else if (offset == 0 && child->length == length) { - rep = Mutable(child->ring(), extra); + rep = AddRing<AddMode::kAppend>(rep, child_arg->ring(), offset, len); + } else if (offset == 0 && child_arg->length == len) { + rep = Mutable(child_arg->ring(), extra); } else { - rep = SubRing(child->ring(), offset, length, extra); + rep = SubRing(child_arg->ring(), offset, len, extra); } }); return Validate(rep, nullptr, __LINE__); @@ -490,18 +482,18 @@ CordRepRing* CordRepRing::Create(CordRep* child, size_t extra) { template <CordRepRing::AddMode mode> CordRepRing* CordRepRing::AddRing(CordRepRing* rep, CordRepRing* ring, - size_t offset, size_t length) { + size_t offset, size_t len) { assert(offset < ring->length); constexpr bool append = mode == AddMode::kAppend; Position head = ring->Find(offset); - Position tail = ring->FindTail(head.index, offset + length); + Position tail = ring->FindTail(head.index, offset + len); const index_type entries = ring->entries(head.index, tail.index); rep = Mutable(rep, entries); // The delta for making ring[head].end_pos into 'len - offset' const pos_type delta_length = - (append ? rep->begin_pos_ + rep->length : rep->begin_pos_ - length) - + (append ? 
rep->begin_pos_ + rep->length : rep->begin_pos_ - len) - ring->entry_begin_pos(head.index) - head.offset; // Start filling at `tail`, or `entries` before `head` @@ -542,36 +534,36 @@ CordRepRing* CordRepRing::AddRing(CordRepRing* rep, CordRepRing* ring, } // Commit changes - rep->length += length; + rep->length += len; if (append) { rep->tail_ = filler.pos(); } else { rep->head_ = filler.head(); - rep->begin_pos_ -= length; + rep->begin_pos_ -= len; } return Validate(rep); } CordRepRing* CordRepRing::AppendSlow(CordRepRing* rep, CordRep* child) { - Consume(child, [&rep](CordRep* child, size_t offset, size_t length) { - if (child->tag == RING) { - rep = AddRing<AddMode::kAppend>(rep, child->ring(), offset, length); + Consume(child, [&rep](CordRep* child_arg, size_t offset, size_t len) { + if (child_arg->tag == RING) { + rep = AddRing<AddMode::kAppend>(rep, child_arg->ring(), offset, len); } else { - rep = AppendLeaf(rep, child, offset, length); + rep = AppendLeaf(rep, child_arg, offset, len); } }); return rep; } CordRepRing* CordRepRing::AppendLeaf(CordRepRing* rep, CordRep* child, - size_t offset, size_t length) { + size_t offset, size_t len) { rep = Mutable(rep, 1); index_type back = rep->tail_; const pos_type begin_pos = rep->begin_pos_ + rep->length; rep->tail_ = rep->advance(rep->tail_); - rep->length += length; - rep->entry_end_pos()[back] = begin_pos + length; + rep->length += len; + rep->entry_end_pos()[back] = begin_pos + len; rep->entry_child()[back] = child; rep->entry_data_offset()[back] = static_cast<offset_type>(offset); return Validate(rep, nullptr, __LINE__); @@ -589,24 +581,24 @@ CordRepRing* CordRepRing::Append(CordRepRing* rep, CordRep* child) { } CordRepRing* CordRepRing::PrependSlow(CordRepRing* rep, CordRep* child) { - RConsume(child, [&](CordRep* child, size_t offset, size_t length) { - if (IsFlatOrExternal(child)) { - rep = PrependLeaf(rep, child, offset, length); + RConsume(child, [&](CordRep* child_arg, size_t offset, size_t len) { + if 
(IsFlatOrExternal(child_arg)) { + rep = PrependLeaf(rep, child_arg, offset, len); } else { - rep = AddRing<AddMode::kPrepend>(rep, child->ring(), offset, length); + rep = AddRing<AddMode::kPrepend>(rep, child_arg->ring(), offset, len); } }); return Validate(rep); } CordRepRing* CordRepRing::PrependLeaf(CordRepRing* rep, CordRep* child, - size_t offset, size_t length) { + size_t offset, size_t len) { rep = Mutable(rep, 1); index_type head = rep->retreat(rep->head_); pos_type end_pos = rep->begin_pos_; rep->head_ = head; - rep->length += length; - rep->begin_pos_ -= length; + rep->length += len; + rep->begin_pos_ -= len; rep->entry_end_pos()[head] = end_pos; rep->entry_child()[head] = child; rep->entry_data_offset()[head] = static_cast<offset_type>(offset); @@ -786,18 +778,18 @@ char CordRepRing::GetCharacter(size_t offset) const { } CordRepRing* CordRepRing::SubRing(CordRepRing* rep, size_t offset, - size_t length, size_t extra) { + size_t len, size_t extra) { assert(offset <= rep->length); - assert(offset <= rep->length - length); + assert(offset <= rep->length - len); - if (length == 0) { + if (len == 0) { CordRep::Unref(rep); return nullptr; } // Find position of first byte Position head = rep->Find(offset); - Position tail = rep->FindTail(head.index, offset + length); + Position tail = rep->FindTail(head.index, offset + len); const size_t new_entries = rep->entries(head.index, tail.index); if (rep->refcount.IsOne() && extra <= (rep->capacity() - new_entries)) { @@ -814,7 +806,7 @@ CordRepRing* CordRepRing::SubRing(CordRepRing* rep, size_t offset, } // Adjust begin_pos and length - rep->length = length; + rep->length = len; rep->begin_pos_ += offset; // Adjust head and tail blocks @@ -888,10 +880,6 @@ CordRepRing* CordRepRing::RemoveSuffix(CordRepRing* rep, size_t len, return Validate(rep); } -#ifdef __clang__ -#pragma clang diagnostic pop -#endif - } // namespace cord_internal ABSL_NAMESPACE_END } // namespace absl diff --git 
a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h index c74d3353ff..2082a5653f 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h +++ b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h @@ -30,15 +30,6 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace cord_internal { -// See https://bugs.llvm.org/show_bug.cgi?id=48477 -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wshadow" -#if __has_warning("-Wshadow-field") -#pragma clang diagnostic ignored "-Wshadow-field" -#endif -#endif - // All operations modifying a ring buffer are implemented as static methods // requiring a CordRepRing instance with a reference adopted by the method. // @@ -210,23 +201,23 @@ class CordRepRing : public CordRep { // referencing up to `size` capacity directly before the existing data. Span<char> GetPrependBuffer(size_t size); - // Returns a cord ring buffer containing `length` bytes of data starting at + // Returns a cord ring buffer containing `len` bytes of data starting at // `offset`. If the input is not shared, this function will remove all head // and tail child nodes outside of the requested range, and adjust the new // head and tail nodes as required. If the input is shared, this function // returns a new instance sharing some or all of the nodes from the input. - static CordRepRing* SubRing(CordRepRing* r, size_t offset, size_t length, + static CordRepRing* SubRing(CordRepRing* r, size_t offset, size_t len, size_t extra = 0); - // Returns a cord ring buffer with the first `length` bytes removed. + // Returns a cord ring buffer with the first `len` bytes removed. // If the input is not shared, this function will remove all head child nodes // fully inside the first `length` bytes, and adjust the new head as required. // If the input is shared, this function returns a new instance sharing some // or all of the nodes from the input. 
- static CordRepRing* RemoveSuffix(CordRepRing* r, size_t length, + static CordRepRing* RemoveSuffix(CordRepRing* r, size_t len, size_t extra = 0); - // Returns a cord ring buffer with the last `length` bytes removed. + // Returns a cord ring buffer with the last `len` bytes removed. // If the input is not shared, this function will remove all head child nodes // fully inside the first `length` bytes, and adjust the new head as required. // If the input is shared, this function returns a new instance sharing some @@ -237,6 +228,18 @@ class CordRepRing : public CordRep { // Returns the character at `offset`. Requires that `offset < length`. char GetCharacter(size_t offset) const; + // Returns true if this instance manages a single contiguous buffer, in which + // case the (optional) output parameter `fragment` is set. Otherwise, the + // function returns false, and `fragment` is left unchanged. + bool IsFlat(absl::string_view* fragment) const; + + // Returns true if the data starting at `offset` with length `len` is + // managed by this instance inside a single contiguous buffer, in which case + // the (optional) output parameter `fragment` is set to the contiguous memory + // starting at offset `offset` with length `length`. Otherwise, the function + // returns false, and `fragment` is left unchanged. + bool IsFlat(size_t offset, size_t len, absl::string_view* fragment) const; + // Testing only: set capacity to requested capacity. void SetCapacityForTesting(size_t capacity); @@ -461,10 +464,10 @@ class CordRepRing : public CordRep { size_t length, size_t extra); // Appends or prepends (depending on AddMode) the ring buffer in `ring' to - // `rep` starting at `offset` with length `length`. + // `rep` starting at `offset` with length `len`. template <AddMode mode> static CordRepRing* AddRing(CordRepRing* rep, CordRepRing* ring, - size_t offset, size_t length); + size_t offset, size_t len); // Increases the data offset for entry `index` by `n`. 
void AddDataOffset(index_type index, size_t n); @@ -576,11 +579,26 @@ inline const CordRepRing* CordRep::ring() const { return static_cast<const CordRepRing*>(this); } -std::ostream& operator<<(std::ostream& s, const CordRepRing& rep); +inline bool CordRepRing::IsFlat(absl::string_view* fragment) const { + if (entries() == 1) { + if (fragment) *fragment = entry_data(head()); + return true; + } + return false; +} -#ifdef __clang__ -#pragma clang diagnostic pop -#endif +inline bool CordRepRing::IsFlat(size_t offset, size_t len, + absl::string_view* fragment) const { + const Position pos = Find(offset); + const absl::string_view data = entry_data(pos.index); + if (data.length() >= len && data.length() - len >= pos.offset) { + if (fragment) *fragment = data.substr(pos.offset, len); + return true; + } + return false; +} + +std::ostream& operator<<(std::ostream& s, const CordRepRing& rep); } // namespace cord_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h index 396c0e2cd8..7ceeaa000e 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h +++ b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h @@ -40,6 +40,10 @@ class CordRepRingReader { // The returned value is undefined if this instance is empty. CordRepRing::index_type index() const { return index_; } + // Returns the current node inside the ring buffer for this instance. + // The returned value is undefined if this instance is empty. + CordRep* node() const { return ring_->entry_child(index_); } + // Returns the length of the referenced ring buffer. // Requires the current instance to be non empty. 
size_t length() const { diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_functions.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.cc new file mode 100644 index 0000000000..f30080f8c2 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.cc @@ -0,0 +1,110 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_functions.h" + +#include <atomic> +#include <cmath> +#include <limits> +#include <random> + +#include "absl/base/attributes.h" +#include "absl/base/config.h" +#include "absl/base/internal/exponential_biased.h" +#include "absl/base/internal/raw_logging.h" + +// TODO(b/162942788): weak 'cordz_disabled' value. +// A strong version is in the 'cordz_disabled_hack_for_odr' library which can +// be linked in to disable cordz at compile time. +extern "C" { +bool absl_internal_cordz_disabled ABSL_ATTRIBUTE_WEAK = false; +} + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +// The average interval until the next sample. A value of 0 disables profiling +// while a value of 1 will profile all Cords. +std::atomic<int> g_cordz_mean_interval(50000); + +} // namespace + +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +// Special negative 'not initialized' per thread value for cordz_next_sample. 
+static constexpr int64_t kInitCordzNextSample = -1; + +ABSL_CONST_INIT thread_local int64_t cordz_next_sample = kInitCordzNextSample; + +// kIntervalIfDisabled is the number of profile-eligible events need to occur +// before the code will confirm that cordz is still disabled. +constexpr int64_t kIntervalIfDisabled = 1 << 16; + +ABSL_ATTRIBUTE_NOINLINE bool cordz_should_profile_slow() { + // TODO(b/162942788): check if profiling is disabled at compile time. + if (absl_internal_cordz_disabled) { + ABSL_RAW_LOG(WARNING, "Cordz info disabled at compile time"); + // We are permanently disabled: set counter to highest possible value. + cordz_next_sample = std::numeric_limits<int64_t>::max(); + return false; + } + + thread_local absl::base_internal::ExponentialBiased + exponential_biased_generator; + int32_t mean_interval = get_cordz_mean_interval(); + + // Check if we disabled profiling. If so, set the next sample to a "large" + // number to minimize the overhead of the should_profile codepath. + if (mean_interval <= 0) { + cordz_next_sample = kIntervalIfDisabled; + return false; + } + + // Check if we're always sampling. + if (mean_interval == 1) { + cordz_next_sample = 1; + return true; + } + + if (cordz_next_sample <= 0) { + // If first check on current thread, check cordz_should_profile() + // again using the created (initial) stride in cordz_next_sample. 
+ const bool initialized = cordz_next_sample != kInitCordzNextSample; + cordz_next_sample = exponential_biased_generator.GetStride(mean_interval); + return initialized || cordz_should_profile(); + } + + --cordz_next_sample; + return false; +} + +void cordz_set_next_sample_for_testing(int64_t next_sample) { + cordz_next_sample = next_sample; +} + +#endif // ABSL_INTERNAL_CORDZ_ENABLED + +int32_t get_cordz_mean_interval() { + return g_cordz_mean_interval.load(std::memory_order_acquire); +} + +void set_cordz_mean_interval(int32_t mean_interval) { + g_cordz_mean_interval.store(mean_interval, std::memory_order_release); +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_functions.h b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.h new file mode 100644 index 0000000000..c9ba14508a --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.h @@ -0,0 +1,85 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_FUNCTIONS_H_ +#define ABSL_STRINGS_CORDZ_FUNCTIONS_H_ + +#include <stdint.h> + +#include "absl/base/attributes.h" +#include "absl/base/config.h" +#include "absl/base/optimization.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// Returns the current sample rate. This represents the average interval +// between samples. 
+int32_t get_cordz_mean_interval(); + +// Sets the sample rate with the average interval between samples. +void set_cordz_mean_interval(int32_t mean_interval); + +// Enable cordz unless any of the following applies: +// - no thread local support +// - MSVC build +// - Android build +// - Apple build +// - DLL build +// Hashtablez is turned off completely in opensource builds. +// MSVC's static atomics are dynamically initialized in debug mode, which breaks +// sampling. +#if defined(ABSL_HAVE_THREAD_LOCAL) && !defined(_MSC_VER) && \ + !defined(ABSL_BUILD_DLL) && !defined(ABSL_CONSUME_DLL) && \ + !defined(__ANDROID__) && !defined(__APPLE__) +#define ABSL_INTERNAL_CORDZ_ENABLED 1 +#endif + +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +// cordz_next_sample is the number of events until the next sample event. If +// the value is 1 or less, the code will check on the next event if cordz is +// enabled, and if so, will sample the Cord. cordz is only enabled when we can +// use thread locals. +ABSL_CONST_INIT extern thread_local int64_t cordz_next_sample; + +// Determines if the next sample should be profiled. If it is, the value pointed +// at by next_sample will be set with the interval until the next sample. +bool cordz_should_profile_slow(); + +// Returns true if the next cord should be sampled. 
+inline bool cordz_should_profile() { + if (ABSL_PREDICT_TRUE(cordz_next_sample > 1)) { + cordz_next_sample--; + return false; + } + return cordz_should_profile_slow(); +} + +// Sets the interval until the next sample (for testing only) +void cordz_set_next_sample_for_testing(int64_t next_sample); + +#else // ABSL_INTERNAL_CORDZ_ENABLED + +inline bool cordz_should_profile() { return false; } +inline void cordz_set_next_sample_for_testing(int64_t) {} + +#endif // ABSL_INTERNAL_CORDZ_ENABLED + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_FUNCTIONS_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_functions_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_functions_test.cc new file mode 100644 index 0000000000..350623c1f3 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_functions_test.cc @@ -0,0 +1,149 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "absl/strings/internal/cordz_functions.h" + +#include <thread> // NOLINT we need real clean new threads + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::Eq; +using ::testing::Ge; +using ::testing::Le; + +TEST(CordzFunctionsTest, SampleRate) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + int32_t expected_sample_rate = 123; + set_cordz_mean_interval(expected_sample_rate); + EXPECT_THAT(get_cordz_mean_interval(), Eq(expected_sample_rate)); + set_cordz_mean_interval(orig_sample_rate); +} + +// Cordz is disabled when we don't have thread_local. All calls to +// should_profile will return false when cordz is diabled, so we might want to +// avoid those tests. +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +TEST(CordzFunctionsTest, ShouldProfileDisable) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(0); + cordz_set_next_sample_for_testing(0); + EXPECT_FALSE(cordz_should_profile()); + // 1 << 16 is from kIntervalIfDisabled in cordz_functions.cc. + EXPECT_THAT(cordz_next_sample, Eq(1 << 16)); + + set_cordz_mean_interval(orig_sample_rate); +} + +TEST(CordzFunctionsTest, ShouldProfileAlways) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(1); + cordz_set_next_sample_for_testing(1); + EXPECT_TRUE(cordz_should_profile()); + EXPECT_THAT(cordz_next_sample, Le(1)); + + set_cordz_mean_interval(orig_sample_rate); +} + +TEST(CordzFunctionsTest, DoesNotAlwaysSampleFirstCord) { + // Set large enough interval such that the chance of 'tons' of threads + // randomly sampling the first call is infinitely small. 
+ set_cordz_mean_interval(10000); + int tries = 0; + bool sampled = false; + do { + ++tries; + ASSERT_THAT(tries, Le(1000)); + std::thread thread([&sampled] { + sampled = cordz_should_profile(); + }); + thread.join(); + } while (sampled); +} + +TEST(CordzFunctionsTest, ShouldProfileRate) { + static constexpr int kDesiredMeanInterval = 1000; + static constexpr int kSamples = 10000; + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(kDesiredMeanInterval); + + int64_t sum_of_intervals = 0; + for (int i = 0; i < kSamples; i++) { + // Setting next_sample to 0 will force cordz_should_profile to generate a + // new value for next_sample each iteration. + cordz_set_next_sample_for_testing(0); + cordz_should_profile(); + sum_of_intervals += cordz_next_sample; + } + + // The sum of independent exponential variables is an Erlang distribution, + // which is a gamma distribution where the shape parameter is equal to the + // number of summands. The distribution used for cordz_should_profile is + // actually floor(Exponential(1/mean)) which introduces bias. However, we can + // apply the squint-really-hard correction factor. That is, when mean is + // large, then if we squint really hard the shape of the distribution between + // N and N+1 looks like a uniform distribution. On average, each value for + // next_sample will be about 0.5 lower than we would expect from an + // exponential distribution. This squint-really-hard correction approach won't + // work when mean is smaller than about 10 but works fine when mean is 1000. + // + // We can use R to calculate a confidence interval. This + // shows how to generate a confidence interval with a false positive rate of + // one in a billion. 
+ // + // $ R -q + // > mean = 1000 + // > kSamples = 10000 + // > errorRate = 1e-9 + // > correction = -kSamples / 2 + // > low = qgamma(errorRate/2, kSamples, 1/mean) + correction + // > high = qgamma(1 - errorRate/2, kSamples, 1/mean) + correction + // > low + // [1] 9396115 + // > high + // [1] 10618100 + EXPECT_THAT(sum_of_intervals, Ge(9396115)); + EXPECT_THAT(sum_of_intervals, Le(10618100)); + + set_cordz_mean_interval(orig_sample_rate); +} + +#else // ABSL_INTERNAL_CORDZ_ENABLED + +TEST(CordzFunctionsTest, ShouldProfileDisabled) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(1); + cordz_set_next_sample_for_testing(0); + EXPECT_FALSE(cordz_should_profile()); + + set_cordz_mean_interval(orig_sample_rate); +} + +#endif // ABSL_INTERNAL_CORDZ_ENABLED + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_handle.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.cc new file mode 100644 index 0000000000..a73fefed59 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.cc @@ -0,0 +1,139 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "absl/strings/internal/cordz_handle.h" + +#include <atomic> + +#include "absl/base/internal/raw_logging.h" // For ABSL_RAW_CHECK +#include "absl/base/internal/spinlock.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +using ::absl::base_internal::SpinLockHolder; + +ABSL_CONST_INIT CordzHandle::Queue CordzHandle::global_queue_(absl::kConstInit); + +CordzHandle::CordzHandle(bool is_snapshot) : is_snapshot_(is_snapshot) { + if (is_snapshot) { + SpinLockHolder lock(&queue_->mutex); + CordzHandle* dq_tail = queue_->dq_tail.load(std::memory_order_acquire); + if (dq_tail != nullptr) { + dq_prev_ = dq_tail; + dq_tail->dq_next_ = this; + } + queue_->dq_tail.store(this, std::memory_order_release); + } +} + +CordzHandle::~CordzHandle() { + ODRCheck(); + if (is_snapshot_) { + std::vector<CordzHandle*> to_delete; + { + SpinLockHolder lock(&queue_->mutex); + CordzHandle* next = dq_next_; + if (dq_prev_ == nullptr) { + // We were head of the queue, delete every CordzHandle until we reach + // either the end of the list, or a snapshot handle. + while (next && !next->is_snapshot_) { + to_delete.push_back(next); + next = next->dq_next_; + } + } else { + // Another CordzHandle existed before this one, don't delete anything. 
+ dq_prev_->dq_next_ = next; + } + if (next) { + next->dq_prev_ = dq_prev_; + } else { + queue_->dq_tail.store(dq_prev_, std::memory_order_release); + } + } + for (CordzHandle* handle : to_delete) { + delete handle; + } + } +} + +bool CordzHandle::SafeToDelete() const { + return is_snapshot_ || queue_->IsEmpty(); +} + +void CordzHandle::Delete(CordzHandle* handle) { + assert(handle); + if (handle) { + handle->ODRCheck(); + Queue* const queue = handle->queue_; + if (!handle->SafeToDelete()) { + SpinLockHolder lock(&queue->mutex); + CordzHandle* dq_tail = queue->dq_tail.load(std::memory_order_acquire); + if (dq_tail != nullptr) { + handle->dq_prev_ = dq_tail; + dq_tail->dq_next_ = handle; + queue->dq_tail.store(handle, std::memory_order_release); + return; + } + } + delete handle; + } +} + +std::vector<const CordzHandle*> CordzHandle::DiagnosticsGetDeleteQueue() { + std::vector<const CordzHandle*> handles; + SpinLockHolder lock(&global_queue_.mutex); + CordzHandle* dq_tail = global_queue_.dq_tail.load(std::memory_order_acquire); + for (const CordzHandle* p = dq_tail; p; p = p->dq_prev_) { + handles.push_back(p); + } + return handles; +} + +bool CordzHandle::DiagnosticsHandleIsSafeToInspect( + const CordzHandle* handle) const { + ODRCheck(); + if (!is_snapshot_) return false; + if (handle == nullptr) return true; + if (handle->is_snapshot_) return false; + bool snapshot_found = false; + SpinLockHolder lock(&queue_->mutex); + for (const CordzHandle* p = queue_->dq_tail; p; p = p->dq_prev_) { + if (p == handle) return !snapshot_found; + if (p == this) snapshot_found = true; + } + ABSL_ASSERT(snapshot_found); // Assert that 'this' is in delete queue. 
+ return true; +} + +std::vector<const CordzHandle*> +CordzHandle::DiagnosticsGetSafeToInspectDeletedHandles() { + ODRCheck(); + std::vector<const CordzHandle*> handles; + if (!is_snapshot()) { + return handles; + } + + SpinLockHolder lock(&queue_->mutex); + for (const CordzHandle* p = dq_next_; p != nullptr; p = p->dq_next_) { + if (!p->is_snapshot()) { + handles.push_back(p); + } + } + return handles; +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_handle.h b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.h new file mode 100644 index 0000000000..5df53c782a --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.h @@ -0,0 +1,131 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_HANDLE_H_ +#define ABSL_STRINGS_CORDZ_HANDLE_H_ + +#include <atomic> +#include <vector> + +#include "absl/base/config.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/base/internal/spinlock.h" +#include "absl/synchronization/mutex.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// This base class allows multiple types of object (CordzInfo and +// CordzSampleToken) to exist simultaneously on the delete queue (pointed to by +// global_dq_tail and traversed using dq_prev_ and dq_next_). 
The +// delete queue guarantees that once a profiler creates a CordzSampleToken and +// has gained visibility into a CordzInfo object, that CordzInfo object will not +// be deleted prematurely. This allows the profiler to inspect all CordzInfo +// objects that are alive without needing to hold a global lock. +class CordzHandle { + public: + CordzHandle() : CordzHandle(false) {} + + bool is_snapshot() const { return is_snapshot_; } + + // Returns true if this instance is safe to be deleted because it is either a + // snapshot, which is always safe to delete, or not included in the global + // delete queue and thus not included in any snapshot. + // Callers are responsible for making sure this instance can not be newly + // discovered by other threads. For example, CordzInfo instances first de-list + // themselves from the global CordzInfo list before determining if they are + // safe to be deleted directly. + // If SafeToDelete returns false, callers MUST use the Delete() method to + // safely queue CordzHandle instances for deletion. + bool SafeToDelete() const; + + // Deletes the provided instance, or puts it on the delete queue to be deleted + // once there are no more sample tokens (snapshot) instances potentially + // referencing the instance. `handle` should not be null. + static void Delete(CordzHandle* handle); + + // Returns the current entries in the delete queue in LIFO order. + static std::vector<const CordzHandle*> DiagnosticsGetDeleteQueue(); + + // Returns true if the provided handle is nullptr or guarded by this handle. + // Since the CordzSnapshot token is itself a CordzHandle, this method will + // allow tests to check if that token is keeping an arbitrary CordzHandle + // alive. + bool DiagnosticsHandleIsSafeToInspect(const CordzHandle* handle) const; + + // Returns the current entries in the delete queue, in LIFO order, that are + // protected by this. 
CordzHandle objects are only placed on the delete queue + // after CordzHandle::Delete is called with them as an argument. Only + // CordzHandle objects that are not also CordzSnapshot objects will be + // included in the return vector. For each of the handles in the return + // vector, the earliest that their memory can be freed is when this + // CordzSnapshot object is deleted. + std::vector<const CordzHandle*> DiagnosticsGetSafeToInspectDeletedHandles(); + + protected: + explicit CordzHandle(bool is_snapshot); + virtual ~CordzHandle(); + + private: + // Global queue data. CordzHandle stores a pointer to the global queue + // instance to harden against ODR violations. + struct Queue { + constexpr explicit Queue(absl::ConstInitType) + : mutex(absl::kConstInit, + absl::base_internal::SCHEDULE_COOPERATIVE_AND_KERNEL) {} + + absl::base_internal::SpinLock mutex; + std::atomic<CordzHandle*> dq_tail ABSL_GUARDED_BY(mutex){nullptr}; + + // Returns true if this delete queue is empty. This method does not acquire + // the lock, but does a 'load acquire' observation on the delete queue tail. + // It is used inside Delete() to check for the presence of a delete queue + // without holding the lock. The assumption is that the caller is in the + // state of 'being deleted', and can not be newly discovered by a concurrent + // 'being constructed' snapshot instance. Practically, this means that any + // such discovery (`find`, 'first' or 'next', etc) must have proper 'happens + // before / after' semantics and atomic fences. + bool IsEmpty() const ABSL_NO_THREAD_SAFETY_ANALYSIS { + return dq_tail.load(std::memory_order_acquire) == nullptr; + } + }; + + void ODRCheck() const { +#ifndef NDEBUG + ABSL_RAW_CHECK(queue_ == &global_queue_, "ODR violation in Cord"); +#endif + } + + ABSL_CONST_INIT static Queue global_queue_; + Queue* const queue_ = &global_queue_; + const bool is_snapshot_; + + // dq_prev_ and dq_next_ require the global queue mutex to be held. 
+ // Unfortunately we can't use thread annotations such that the thread safety + // analysis understands that queue_ and global_queue_ are one and the same. + CordzHandle* dq_prev_ = nullptr; + CordzHandle* dq_next_ = nullptr; +}; + +class CordzSnapshot : public CordzHandle { + public: + CordzSnapshot() : CordzHandle(true) {} +}; + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_HANDLE_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_handle_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_handle_test.cc new file mode 100644 index 0000000000..fd68e06b3e --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_handle_test.cc @@ -0,0 +1,265 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "absl/strings/internal/cordz_handle.h" + +#include <random> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::ElementsAre; +using ::testing::Gt; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +// Local less verbose helper +std::vector<const CordzHandle*> DeleteQueue() { + return CordzHandle::DiagnosticsGetDeleteQueue(); +} + +struct CordzHandleDeleteTracker : public CordzHandle { + bool* deleted; + explicit CordzHandleDeleteTracker(bool* deleted) : deleted(deleted) {} + ~CordzHandleDeleteTracker() override { *deleted = true; } +}; + +TEST(CordzHandleTest, DeleteQueueIsEmpty) { + EXPECT_THAT(DeleteQueue(), SizeIs(0)); +} + +TEST(CordzHandleTest, CordzHandleCreateDelete) { + bool deleted = false; + auto* handle = new CordzHandleDeleteTracker(&deleted); + EXPECT_FALSE(handle->is_snapshot()); + EXPECT_TRUE(handle->SafeToDelete()); + EXPECT_THAT(DeleteQueue(), SizeIs(0)); + + CordzHandle::Delete(handle); + EXPECT_THAT(DeleteQueue(), SizeIs(0)); + EXPECT_TRUE(deleted); +} + +TEST(CordzHandleTest, CordzSnapshotCreateDelete) { + auto* snapshot = new CordzSnapshot(); + EXPECT_TRUE(snapshot->is_snapshot()); + EXPECT_TRUE(snapshot->SafeToDelete()); + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot)); + delete snapshot; + EXPECT_THAT(DeleteQueue(), SizeIs(0)); +} + +TEST(CordzHandleTest, CordzHandleCreateDeleteWithSnapshot) { + bool deleted = false; + auto* snapshot = new CordzSnapshot(); + auto* handle = new CordzHandleDeleteTracker(&deleted); + EXPECT_FALSE(handle->SafeToDelete()); + + CordzHandle::Delete(handle); + EXPECT_THAT(DeleteQueue(), ElementsAre(handle, snapshot)); + EXPECT_FALSE(deleted); + EXPECT_FALSE(handle->SafeToDelete()); + + delete snapshot; + 
EXPECT_THAT(DeleteQueue(), SizeIs(0)); + EXPECT_TRUE(deleted); +} + +TEST(CordzHandleTest, MultiSnapshot) { + bool deleted[3] = {false, false, false}; + + CordzSnapshot* snapshot[3]; + CordzHandleDeleteTracker* handle[3]; + for (int i = 0; i < 3; ++i) { + snapshot[i] = new CordzSnapshot(); + handle[i] = new CordzHandleDeleteTracker(&deleted[i]); + CordzHandle::Delete(handle[i]); + } + + EXPECT_THAT(DeleteQueue(), ElementsAre(handle[2], snapshot[2], handle[1], + snapshot[1], handle[0], snapshot[0])); + EXPECT_THAT(deleted, ElementsAre(false, false, false)); + + delete snapshot[1]; + EXPECT_THAT(DeleteQueue(), ElementsAre(handle[2], snapshot[2], handle[1], + handle[0], snapshot[0])); + EXPECT_THAT(deleted, ElementsAre(false, false, false)); + + delete snapshot[0]; + EXPECT_THAT(DeleteQueue(), ElementsAre(handle[2], snapshot[2])); + EXPECT_THAT(deleted, ElementsAre(true, true, false)); + + delete snapshot[2]; + EXPECT_THAT(DeleteQueue(), SizeIs(0)); + EXPECT_THAT(deleted, ElementsAre(true, true, deleted)); +} + +TEST(CordzHandleTest, DiagnosticsHandleIsSafeToInspect) { + CordzSnapshot snapshot1; + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(nullptr)); + + auto* handle1 = new CordzHandle(); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); + + CordzHandle::Delete(handle1); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); + + CordzSnapshot snapshot2; + auto* handle2 = new CordzHandle(); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle2)); + EXPECT_FALSE(snapshot2.DiagnosticsHandleIsSafeToInspect(handle1)); + EXPECT_TRUE(snapshot2.DiagnosticsHandleIsSafeToInspect(handle2)); + + CordzHandle::Delete(handle2); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); +} + +TEST(CordzHandleTest, DiagnosticsGetSafeToInspectDeletedHandles) { + EXPECT_THAT(DeleteQueue(), IsEmpty()); + + auto* handle = new CordzHandle(); + auto* snapshot1 = 
new CordzSnapshot(); + + // snapshot1 should be able to see handle. + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot1)); + EXPECT_TRUE(snapshot1->DiagnosticsHandleIsSafeToInspect(handle)); + EXPECT_THAT(snapshot1->DiagnosticsGetSafeToInspectDeletedHandles(), + IsEmpty()); + + // This handle will be safe to inspect as long as snapshot1 is alive. However, + // since only snapshot1 can prove that it's alive, it will be hidden from + // snapshot2. + CordzHandle::Delete(handle); + + // This snapshot shouldn't be able to see handle because handle was already + // sent to Delete. + auto* snapshot2 = new CordzSnapshot(); + + // DeleteQueue elements are LIFO order. + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot2, handle, snapshot1)); + + EXPECT_TRUE(snapshot1->DiagnosticsHandleIsSafeToInspect(handle)); + EXPECT_FALSE(snapshot2->DiagnosticsHandleIsSafeToInspect(handle)); + + EXPECT_THAT(snapshot1->DiagnosticsGetSafeToInspectDeletedHandles(), + ElementsAre(handle)); + EXPECT_THAT(snapshot2->DiagnosticsGetSafeToInspectDeletedHandles(), + IsEmpty()); + + CordzHandle::Delete(snapshot1); + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot2)); + + CordzHandle::Delete(snapshot2); + EXPECT_THAT(DeleteQueue(), IsEmpty()); +} + +// Create and delete CordzHandle and CordzSnapshot objects in multiple threads +// so that tsan has some time to chew on it and look for memory problems. +TEST(CordzHandleTest, MultiThreaded) { + Notification stop; + static constexpr int kNumThreads = 4; + // Keep the number of handles relatively small so that the test will naturally + // transition to an empty delete queue during the test. If there are, say, 100 + // handles, that will virtually never happen. With 10 handles and around 50k + // iterations in each of 4 threads, the delete queue appears to become empty + // around 200 times. + static constexpr int kNumHandles = 10; + + // Each thread is going to pick a random index and atomically swap its + // CordzHandle with one in handles. 
This way, each thread can avoid + // manipulating a CordzHandle that might be operated upon in another thread. + std::vector<std::atomic<CordzHandle*>> handles(kNumHandles); + + // global bool which is set when any thread did get some 'safe to inspect' + // handles. On some platforms and OSS tests, we might risk that some pool + // threads are starved, stalled, or just got a few unlikely random 'handle' + // coin tosses, so we satisfy this test with simply observing 'some' thread + // did something meaningful, which should minimize the potential for flakes. + std::atomic<bool> found_safe_to_inspect(false); + + { + absl::synchronization_internal::ThreadPool pool(kNumThreads); + for (int i = 0; i < kNumThreads; ++i) { + pool.Schedule([&stop, &handles, &found_safe_to_inspect]() { + std::minstd_rand gen; + std::uniform_int_distribution<int> dist_type(0, 2); + std::uniform_int_distribution<int> dist_handle(0, kNumHandles - 1); + + while (!stop.HasBeenNotified()) { + CordzHandle* handle; + switch (dist_type(gen)) { + case 0: + handle = new CordzHandle(); + break; + case 1: + handle = new CordzSnapshot(); + break; + default: + handle = nullptr; + break; + } + CordzHandle* old_handle = handles[dist_handle(gen)].exchange(handle); + if (old_handle != nullptr) { + std::vector<const CordzHandle*> safe_to_inspect = + old_handle->DiagnosticsGetSafeToInspectDeletedHandles(); + for (const CordzHandle* handle : safe_to_inspect) { + // We're in a tight loop, so don't generate too many error + // messages. + ASSERT_FALSE(handle->is_snapshot()); + } + if (!safe_to_inspect.empty()) { + found_safe_to_inspect.store(true); + } + CordzHandle::Delete(old_handle); + } + } + + // Have each thread attempt to clean up everything. Some thread will be + // the last to reach this cleanup code, and it will be guaranteed to + // clean up everything because nothing remains to create new handles. 
+ for (auto& h : handles) { + if (CordzHandle* handle = h.exchange(nullptr)) { + CordzHandle::Delete(handle); + } + } + }); + } + + // The threads will hammer away. Give it a little bit of time for tsan to + // spot errors. + absl::SleepFor(absl::Seconds(3)); + stop.Notify(); + } + + // Confirm that the test did *something*. This check will be satisfied as + // long as any thread has deleted a CordzSnapshot object and a non-snapshot + // CordzHandle was deleted after the CordzSnapshot was created. + // See also comments on `found_safe_to_inspect` + EXPECT_TRUE(found_safe_to_inspect.load()); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_info.cc new file mode 100644 index 0000000000..a3a0b9c046 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info.cc @@ -0,0 +1,436 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "absl/strings/internal/cordz_info.h" + +#include "absl/base/config.h" +#include "absl/base/internal/spinlock.h" +#include "absl/container/inlined_vector.h" +#include "absl/debugging/stacktrace.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cord_rep_ring.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/synchronization/mutex.h" +#include "absl/types/span.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +using ::absl::base_internal::SpinLockHolder; + +constexpr int CordzInfo::kMaxStackDepth; + +ABSL_CONST_INIT CordzInfo::List CordzInfo::global_list_{absl::kConstInit}; + +namespace { + +// CordRepAnalyzer performs the analysis of a cord. +// +// It computes absolute node counts and total memory usage, and an 'estimated +// fair share memory usage` statistic. +// Conceptually, it divides the 'memory usage' at each location in the 'cord +// graph' by the cumulative reference count of that location. The cumulative +// reference count is the factored total of all edges leading into that node. +// +// The top level node is treated specially: we assume the current thread +// (typically called from the CordzHandler) to hold a reference purely to +// perform a safe analysis, and not being part of the application. So we +// substract 1 from the reference count of the top node to compute the +// 'application fair share' excluding the reference of the current thread. +// +// An example of fair sharing, and why we multiply reference counts: +// Assume we have 2 CordReps, both being a Substring referencing a Flat: +// CordSubstring A (refcount = 5) --> child Flat C (refcount = 2) +// CordSubstring B (refcount = 9) --> child Flat C (refcount = 2) +// +// Flat C has 2 incoming edges from the 2 substrings (refcount = 2) and is not +// referenced directly anywhere else. 
Translated into a 'fair share', we then +// attribute 50% of the memory (memory / refcount = 2) to each incoming edge. +// Rep A has a refcount of 5, so we attribute each incoming edge 1 / 5th of the +// memory cost below it, i.e.: the fair share of Rep A of the memory used by C +// is then 'memory C / (refcount C * refcount A) + (memory A / refcount A)'. +// It is also easy to see how all incoming edges add up to 100%. +class CordRepAnalyzer { + public: + // Creates an analyzer instance binding to `statistics`. + explicit CordRepAnalyzer(CordzStatistics& statistics) + : statistics_(statistics) {} + + // Analyzes the memory statistics and node counts for the provided `rep`, and + // adds the results to `statistics`. Note that node counts and memory sizes + // are not initialized, computed values are added to any existing values. + void AnalyzeCordRep(const CordRep* rep) { + // Process all linear nodes. + // As per the class comments, use refcout - 1 on the top level node, as the + // top level node is assumed to be referenced only for analysis purposes. + size_t refcount = rep->refcount.Get(); + RepRef repref{rep, (refcount > 1) ? refcount - 1 : 1}; + + // Process all top level linear nodes (substrings and flats). + repref = CountLinearReps(repref, memory_usage_); + + // We should have have either a concat or ring node node if not null. + if (repref.rep != nullptr) { + assert(repref.rep->tag == RING || repref.rep->tag == CONCAT); + if (repref.rep->tag == RING) { + AnalyzeRing(repref); + } else if (repref.rep->tag == CONCAT) { + AnalyzeConcat(repref); + } + } + + // Adds values to output + statistics_.estimated_memory_usage += memory_usage_.total; + statistics_.estimated_fair_share_memory_usage += memory_usage_.fair_share; + } + + private: + // RepRef identifies a CordRep* inside the Cord tree with its cumulative + // refcount including itself. 
For example, a tree consisting of a substring + // with a refcount of 3 and a child flat with a refcount of 4 will have RepRef + // refcounts of 3 and 12 respectively. + struct RepRef { + const CordRep* rep; + size_t refcount; + + // Returns a 'child' RepRef which contains the cumulative reference count of + // this instance multiplied by the child's reference count. + RepRef Child(const CordRep* child) const { + return RepRef{child, refcount * child->refcount.Get()}; + } + }; + + // Memory usage values + struct MemoryUsage { + size_t total = 0; + size_t fair_share = 0; + + // Adds 'size` memory usage to this class, with a cumulative (recursive) + // reference count of `refcount` + void Add(size_t size, size_t refcount) { + total += size; + fair_share += size / refcount; + } + }; + + // Returns `rr` if `rr.rep` is not null and a CONCAT type. + // Asserts that `rr.rep` is a concat node or null. + static RepRef AssertConcat(RepRef repref) { + const CordRep* rep = repref.rep; + assert(rep == nullptr || rep->tag == CONCAT); + return (rep != nullptr && rep->tag == CONCAT) ? repref : RepRef{nullptr, 0}; + } + + // Counts a flat of the provide allocated size + void CountFlat(size_t size) { + statistics_.node_count++; + statistics_.node_counts.flat++; + if (size <= 64) { + statistics_.node_counts.flat_64++; + } else if (size <= 128) { + statistics_.node_counts.flat_128++; + } else if (size <= 256) { + statistics_.node_counts.flat_256++; + } else if (size <= 512) { + statistics_.node_counts.flat_512++; + } else if (size <= 1024) { + statistics_.node_counts.flat_1k++; + } + } + + // Processes 'linear' reps (substring, flat, external) not requiring iteration + // or recursion. Returns RefRep{null} if all reps were processed, else returns + // the top-most non-linear concat or ring cordrep. 
+ // Node counts are updated into `statistics_`, memory usage is update into + // `memory_usage`, which typically references `memory_usage_` except for ring + // buffers where we count children unrounded. + RepRef CountLinearReps(RepRef rep, MemoryUsage& memory_usage) { + // Consume all substrings + while (rep.rep->tag == SUBSTRING) { + statistics_.node_count++; + statistics_.node_counts.substring++; + memory_usage.Add(sizeof(CordRepSubstring), rep.refcount); + rep = rep.Child(rep.rep->substring()->child); + } + + // Consume possible FLAT + if (rep.rep->tag >= FLAT) { + size_t size = rep.rep->flat()->AllocatedSize(); + CountFlat(size); + memory_usage.Add(size, rep.refcount); + return RepRef{nullptr, 0}; + } + + // Consume possible external + if (rep.rep->tag == EXTERNAL) { + statistics_.node_count++; + statistics_.node_counts.external++; + size_t size = rep.rep->length + sizeof(CordRepExternalImpl<intptr_t>); + memory_usage.Add(size, rep.refcount); + return RepRef{nullptr, 0}; + } + + return rep; + } + + // Analyzes the provided concat node in a flattened recursive way. + void AnalyzeConcat(RepRef rep) { + absl::InlinedVector<RepRef, 47> pending; + + while (rep.rep != nullptr) { + const CordRepConcat* concat = rep.rep->concat(); + RepRef left = rep.Child(concat->left); + RepRef right = rep.Child(concat->right); + + statistics_.node_count++; + statistics_.node_counts.concat++; + memory_usage_.Add(sizeof(CordRepConcat), rep.refcount); + + right = AssertConcat(CountLinearReps(right, memory_usage_)); + rep = AssertConcat(CountLinearReps(left, memory_usage_)); + if (rep.rep != nullptr) { + if (right.rep != nullptr) { + pending.push_back(right); + } + } else if (right.rep != nullptr) { + rep = right; + } else if (!pending.empty()) { + rep = pending.back(); + pending.pop_back(); + } + } + } + + // Counts the provided ring buffer child into `child_usage`. 
+ void CountRingChild(const CordRep* child, MemoryUsage& child_usage) { + RepRef rep{child, static_cast<size_t>(child->refcount.Get())}; + rep = CountLinearReps(rep, child_usage); + assert(rep.rep == nullptr); + } + + // Analyzes the provided ring. As ring buffers can have many child nodes, the + // effect of rounding errors can become non trivial, so we compute the totals + // first at the ring level, and then divide the fair share of the total + // including children fair share totals. + void AnalyzeRing(RepRef rep) { + statistics_.node_count++; + statistics_.node_counts.ring++; + MemoryUsage ring_usage; + const CordRepRing* ring = rep.rep->ring(); + ring_usage.Add(CordRepRing::AllocSize(ring->capacity()), 1); + ring->ForEach([&](CordRepRing::index_type pos) { + CountRingChild(ring->entry_child(pos), ring_usage); + }); + memory_usage_.total += ring_usage.total; + memory_usage_.fair_share += ring_usage.fair_share / rep.refcount; + } + + CordzStatistics& statistics_; + MemoryUsage memory_usage_; +}; + +} // namespace + +CordzInfo* CordzInfo::Head(const CordzSnapshot& snapshot) { + ABSL_ASSERT(snapshot.is_snapshot()); + + // We can do an 'unsafe' load of 'head', as we are guaranteed that the + // instance it points to is kept alive by the provided CordzSnapshot, so we + // can simply return the current value using an acquire load. + // We do enforce in DEBUG builds that the 'head' value is present in the + // delete queue: ODR violations may lead to 'snapshot' and 'global_list_' + // being in different libraries / modules. + CordzInfo* head = global_list_.head.load(std::memory_order_acquire); + ABSL_ASSERT(snapshot.DiagnosticsHandleIsSafeToInspect(head)); + return head; +} + +CordzInfo* CordzInfo::Next(const CordzSnapshot& snapshot) const { + ABSL_ASSERT(snapshot.is_snapshot()); + + // Similar to the 'Head()' function, we do not need a mutex here. 
+ CordzInfo* next = ci_next_.load(std::memory_order_acquire); + ABSL_ASSERT(snapshot.DiagnosticsHandleIsSafeToInspect(this)); + ABSL_ASSERT(snapshot.DiagnosticsHandleIsSafeToInspect(next)); + return next; +} + +void CordzInfo::TrackCord(InlineData& cord, MethodIdentifier method) { + assert(cord.is_tree()); + assert(!cord.is_profiled()); + CordzInfo* cordz_info = new CordzInfo(cord.as_tree(), nullptr, method); + cord.set_cordz_info(cordz_info); + cordz_info->Track(); +} + +void CordzInfo::TrackCord(InlineData& cord, const InlineData& src, + MethodIdentifier method) { + assert(cord.is_tree()); + assert(src.is_tree()); + + // Unsample current as we the current cord is being replaced with 'src', + // so any method history is no longer relevant. + CordzInfo* cordz_info = cord.cordz_info(); + if (cordz_info != nullptr) cordz_info->Untrack(); + + // Start new cord sample + cordz_info = new CordzInfo(cord.as_tree(), src.cordz_info(), method); + cord.set_cordz_info(cordz_info); + cordz_info->Track(); +} + +void CordzInfo::MaybeTrackCordImpl(InlineData& cord, const InlineData& src, + MethodIdentifier method) { + if (src.is_profiled()) { + TrackCord(cord, src, method); + } else if (cord.is_profiled()) { + cord.cordz_info()->Untrack(); + cord.clear_cordz_info(); + } +} + +CordzInfo::MethodIdentifier CordzInfo::GetParentMethod(const CordzInfo* src) { + if (src == nullptr) return MethodIdentifier::kUnknown; + return src->parent_method_ != MethodIdentifier::kUnknown ? 
src->parent_method_ + : src->method_; +} + +int CordzInfo::FillParentStack(const CordzInfo* src, void** stack) { + assert(stack); + if (src == nullptr) return 0; + if (src->parent_stack_depth_) { + memcpy(stack, src->parent_stack_, src->parent_stack_depth_ * sizeof(void*)); + return src->parent_stack_depth_; + } + memcpy(stack, src->stack_, src->stack_depth_ * sizeof(void*)); + return src->stack_depth_; +} + +CordzInfo::CordzInfo(CordRep* rep, const CordzInfo* src, + MethodIdentifier method) + : rep_(rep), + stack_depth_(absl::GetStackTrace(stack_, /*max_depth=*/kMaxStackDepth, + /*skip_count=*/1)), + parent_stack_depth_(FillParentStack(src, parent_stack_)), + method_(method), + parent_method_(GetParentMethod(src)), + create_time_(absl::Now()) { + update_tracker_.LossyAdd(method); + if (src) { + // Copy parent counters. + update_tracker_.LossyAdd(src->update_tracker_); + } +} + +CordzInfo::~CordzInfo() { + // `rep_` is potentially kept alive if CordzInfo is included + // in a collection snapshot (which should be rare). 
+ if (ABSL_PREDICT_FALSE(rep_)) { + CordRep::Unref(rep_); + } +} + +void CordzInfo::Track() { + SpinLockHolder l(&list_->mutex); + + CordzInfo* const head = list_->head.load(std::memory_order_acquire); + if (head != nullptr) { + head->ci_prev_.store(this, std::memory_order_release); + } + ci_next_.store(head, std::memory_order_release); + list_->head.store(this, std::memory_order_release); +} + +void CordzInfo::Untrack() { + ODRCheck(); + { + SpinLockHolder l(&list_->mutex); + + CordzInfo* const head = list_->head.load(std::memory_order_acquire); + CordzInfo* const next = ci_next_.load(std::memory_order_acquire); + CordzInfo* const prev = ci_prev_.load(std::memory_order_acquire); + + if (next) { + ABSL_ASSERT(next->ci_prev_.load(std::memory_order_acquire) == this); + next->ci_prev_.store(prev, std::memory_order_release); + } + if (prev) { + ABSL_ASSERT(head != this); + ABSL_ASSERT(prev->ci_next_.load(std::memory_order_acquire) == this); + prev->ci_next_.store(next, std::memory_order_release); + } else { + ABSL_ASSERT(head == this); + list_->head.store(next, std::memory_order_release); + } + } + + // We can no longer be discovered: perform a fast path check if we are not + // listed on any delete queue, so we can directly delete this instance. 
+ if (SafeToDelete()) { + UnsafeSetCordRep(nullptr); + delete this; + return; + } + + // We are likely part of a snapshot, extend the life of the CordRep + { + absl::MutexLock lock(&mutex_); + if (rep_) CordRep::Ref(rep_); + } + CordzHandle::Delete(this); +} + +void CordzInfo::Lock(MethodIdentifier method) + ABSL_EXCLUSIVE_LOCK_FUNCTION(mutex_) { + mutex_.Lock(); + update_tracker_.LossyAdd(method); + assert(rep_); +} + +void CordzInfo::Unlock() ABSL_UNLOCK_FUNCTION(mutex_) { + bool tracked = rep_ != nullptr; + mutex_.Unlock(); + if (!tracked) { + Untrack(); + } +} + +absl::Span<void* const> CordzInfo::GetStack() const { + return absl::MakeConstSpan(stack_, stack_depth_); +} + +absl::Span<void* const> CordzInfo::GetParentStack() const { + return absl::MakeConstSpan(parent_stack_, parent_stack_depth_); +} + +CordzStatistics CordzInfo::GetCordzStatistics() const { + CordzStatistics stats; + stats.method = method_; + stats.parent_method = parent_method_; + stats.update_tracker = update_tracker_; + if (CordRep* rep = RefCordRep()) { + stats.size = rep->length; + CordRepAnalyzer analyzer(stats); + analyzer.AnalyzeCordRep(rep); + CordRep::Unref(rep); + } + return stats; +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info.h b/third_party/abseil-cpp/absl/strings/internal/cordz_info.h new file mode 100644 index 0000000000..026d5b9981 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info.h @@ -0,0 +1,298 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_INFO_H_ +#define ABSL_STRINGS_CORDZ_INFO_H_ + +#include <atomic> +#include <cstdint> +#include <functional> + +#include "absl/base/config.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/base/internal/spinlock.h" +#include "absl/base/thread_annotations.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cordz_functions.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/synchronization/mutex.h" +#include "absl/types/span.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzInfo tracks a profiled Cord. Each of these objects can be in two places. +// If a Cord is alive, the CordzInfo will be in the global_cordz_infos map, and +// can also be retrieved via the linked list starting with +// global_cordz_infos_head and continued via the cordz_info_next() method. When +// a Cord has reached the end of its lifespan, the CordzInfo object will be +// migrated out of the global_cordz_infos list and the global_cordz_infos_map, +// and will either be deleted or appended to the global_delete_queue. If it is +// placed on the global_delete_queue, the CordzInfo object will be cleaned in +// the destructor of a CordzSampleToken object. 
+class ABSL_LOCKABLE CordzInfo : public CordzHandle { + public: + using MethodIdentifier = CordzUpdateTracker::MethodIdentifier; + + // TrackCord creates a CordzInfo instance which tracks important metrics of + // a sampled cord, and stores the created CordzInfo instance into `cord'. All + // CordzInfo instances are placed in a global list which is used to discover + // and snapshot all actively tracked cords. Callers are responsible for + // calling UntrackCord() before the tracked Cord instance is deleted, or to + // stop tracking the sampled Cord. Callers are also responsible for guarding + // changes to the 'tree' value of a Cord (InlineData.tree) through the Lock() + // and Unlock() calls. Any change resulting in a new tree value for the cord + // requires a call to SetCordRep() before the old tree has been unreffed + // and/or deleted. `method` identifies the Cord public API method initiating + // the cord to be sampled. + // Requires `cord` to hold a tree, and `cord.cordz_info()` to be null. + static void TrackCord(InlineData& cord, MethodIdentifier method); + + // Identical to TrackCord(), except that this function fills the + // `parent_stack` and `parent_method` properties of the returned CordzInfo + // instance from the provided `src` instance if `src` is sampled. + // This function should be used for sampling 'copy constructed' and 'copy + // assigned' cords. This function allows 'cord` to be already sampled, in + // which case the CordzInfo will be newly created from `src`. + static void TrackCord(InlineData& cord, const InlineData& src, + MethodIdentifier method); + + // Maybe sample the cord identified by 'cord' for method 'method'. + // Uses `cordz_should_profile` to randomly pick cords to be sampled, and if + // so, invokes `TrackCord` to start sampling `cord`. + static void MaybeTrackCord(InlineData& cord, MethodIdentifier method); + + // Maybe sample the cord identified by 'cord' for method 'method'. 
+ // `src` identifies a 'parent' cord which is assigned to `cord`, typically the + // input cord for a copy constructor, or an assign method such as `operator=` + // `cord` will be sampled if (and only if) `src` is sampled. + // If `cord` is currently being sampled and `src` is not being sampled, then + // this function will stop sampling the cord and reset the cord's cordz_info. + // + // Previously this function defined that `cord` will be sampled if either + // `src` is sampled, or if `cord` is randomly picked for sampling. However, + // this can cause issues, as there may be paths where some cord is assigned an + // indirect copy of it's own value. As such a 'string of copies' would then + // remain sampled (`src.is_profiled`), then assigning such a cord back to + // 'itself' creates a cycle where the cord will converge to 'always sampled`. + // + // For example: + // + // Cord x; + // for (...) { + // // Copy ctor --> y.is_profiled := x.is_profiled | random(...) + // Cord y = x; + // ... + // // Assign x = y --> x.is_profiled = y.is_profiled | random(...) + // // ==> x.is_profiled |= random(...) + // // ==> x converges to 'always profiled' + // x = y; + // } + static void MaybeTrackCord(InlineData& cord, const InlineData& src, + MethodIdentifier method); + + // Stops tracking changes for a sampled cord, and deletes the provided info. + // This function must be called before the sampled cord instance is deleted, + // and before the root cordrep of the sampled cord is unreffed. + // This function may extend the lifetime of the cordrep in cases where the + // CordInfo instance is being held by a concurrent collection thread. + void Untrack(); + + // Invokes UntrackCord() on `info` if `info` is not null. + static void MaybeUntrackCord(CordzInfo* info); + + CordzInfo() = delete; + CordzInfo(const CordzInfo&) = delete; + CordzInfo& operator=(const CordzInfo&) = delete; + + // Retrieves the oldest existing CordzInfo. 
+ static CordzInfo* Head(const CordzSnapshot& snapshot) + ABSL_NO_THREAD_SAFETY_ANALYSIS; + + // Retrieves the next oldest existing CordzInfo older than 'this' instance. + CordzInfo* Next(const CordzSnapshot& snapshot) const + ABSL_NO_THREAD_SAFETY_ANALYSIS; + + // Locks this instance for the update identified by `method`. + // Increases the count for `method` in `update_tracker`. + void Lock(MethodIdentifier method) ABSL_EXCLUSIVE_LOCK_FUNCTION(mutex_); + + // Unlocks this instance. If the contained `rep` has been set to null + // indicating the Cord has been cleared or is otherwise no longer sampled, + // then this method will delete this CordzInfo instance. + void Unlock() ABSL_UNLOCK_FUNCTION(mutex_); + + // Asserts that this CordzInfo instance is locked. + void AssertHeld() ABSL_ASSERT_EXCLUSIVE_LOCK(mutex_); + + // Updates the `rep` property of this instance. This methods is invoked by + // Cord logic each time the root node of a sampled Cord changes, and before + // the old root reference count is deleted. This guarantees that collection + // code can always safely take a reference on the tracked cord. + // Requires a lock to be held through the `Lock()` method. + // TODO(b/117940323): annotate with ABSL_EXCLUSIVE_LOCKS_REQUIRED once all + // Cord code is in a state where this can be proven true by the compiler. + void SetCordRep(CordRep* rep); + + // Returns the current `rep` property of this instance with a reference + // added, or null if this instance represents a cord that has since been + // deleted or untracked. + CordRep* RefCordRep() const ABSL_LOCKS_EXCLUDED(mutex_); + + // Returns the current value of `rep_` for testing purposes only. + CordRep* GetCordRepForTesting() const ABSL_NO_THREAD_SAFETY_ANALYSIS { + return rep_; + } + + // Sets the current value of `rep_` for testing purposes only. 
+ void SetCordRepForTesting(CordRep* rep) ABSL_NO_THREAD_SAFETY_ANALYSIS { + rep_ = rep; + } + + // Returns the stack trace for where the cord was first sampled. Cords are + // potentially sampled when they promote from an inlined cord to a tree or + // ring representation, which is not necessarily the location where the cord + // was first created. Some cords are created as inlined cords, and only as + // data is added do they become a non-inlined cord. However, typically the + // location represents reasonably well where the cord is 'created'. + absl::Span<void* const> GetStack() const; + + // Returns the stack trace for a sampled cord's 'parent stack trace'. This + // value may be set if the cord is sampled (promoted) after being created + // from, or being assigned the value of an existing (sampled) cord. + absl::Span<void* const> GetParentStack() const; + + // Retrieves the CordzStatistics associated with this Cord. The statistics + // are only updated when a Cord goes through a mutation, such as an Append + // or RemovePrefix. + CordzStatistics GetCordzStatistics() const; + + private: + using SpinLock = absl::base_internal::SpinLock; + using SpinLockHolder = ::absl::base_internal::SpinLockHolder; + + // Global cordz info list. CordzInfo stores a pointer to the global list + // instance to harden against ODR violations. + struct List { + constexpr explicit List(absl::ConstInitType) + : mutex(absl::kConstInit, + absl::base_internal::SCHEDULE_COOPERATIVE_AND_KERNEL) {} + + SpinLock mutex; + std::atomic<CordzInfo*> head ABSL_GUARDED_BY(mutex){nullptr}; + }; + + static constexpr int kMaxStackDepth = 64; + + explicit CordzInfo(CordRep* rep, const CordzInfo* src, + MethodIdentifier method); + ~CordzInfo() override; + + // Sets `rep_` without holding a lock. 
+ void UnsafeSetCordRep(CordRep* rep) ABSL_NO_THREAD_SAFETY_ANALYSIS; + + void Track(); + + // Returns the parent method from `src`, which is either `parent_method_` or + // `method_` depending on `parent_method_` being kUnknown. + // Returns kUnknown if `src` is null. + static MethodIdentifier GetParentMethod(const CordzInfo* src); + + // Fills the provided stack from `src`, copying either `parent_stack_` or + // `stack_` depending on `parent_stack_` being empty, returning the size of + // the parent stack. + // Returns 0 if `src` is null. + static int FillParentStack(const CordzInfo* src, void** stack); + + void ODRCheck() const { +#ifndef NDEBUG + ABSL_RAW_CHECK(list_ == &global_list_, "ODR violation in Cord"); +#endif + } + + // Non-inlined implementation of `MaybeTrackCord`, which is executed if + // either `src` is sampled or `cord` is sampled, and either untracks or + // tracks `cord` as documented per `MaybeTrackCord`. + static void MaybeTrackCordImpl(InlineData& cord, const InlineData& src, + MethodIdentifier method); + + ABSL_CONST_INIT static List global_list_; + List* const list_ = &global_list_; + + // ci_prev_ and ci_next_ require the global list mutex to be held. + // Unfortunately we can't use thread annotations such that the thread safety + // analysis understands that list_ and global_list_ are one and the same. 
+ std::atomic<CordzInfo*> ci_prev_{nullptr}; + std::atomic<CordzInfo*> ci_next_{nullptr}; + + mutable absl::Mutex mutex_; + CordRep* rep_ ABSL_GUARDED_BY(mutex_); + + void* stack_[kMaxStackDepth]; + void* parent_stack_[kMaxStackDepth]; + const int stack_depth_; + const int parent_stack_depth_; + const MethodIdentifier method_; + const MethodIdentifier parent_method_; + CordzUpdateTracker update_tracker_; + const absl::Time create_time_; +}; + +inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CordzInfo::MaybeTrackCord( + InlineData& cord, MethodIdentifier method) { + if (ABSL_PREDICT_FALSE(cordz_should_profile())) { + TrackCord(cord, method); + } +} + +inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CordzInfo::MaybeTrackCord( + InlineData& cord, const InlineData& src, MethodIdentifier method) { + if (ABSL_PREDICT_FALSE(InlineData::is_either_profiled(cord, src))) { + MaybeTrackCordImpl(cord, src, method); + } +} + +inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CordzInfo::MaybeUntrackCord( + CordzInfo* info) { + if (ABSL_PREDICT_FALSE(info)) { + info->Untrack(); + } +} + +inline void CordzInfo::AssertHeld() ABSL_ASSERT_EXCLUSIVE_LOCK(mutex_) { +#ifndef NDEBUG + mutex_.AssertHeld(); +#endif +} + +inline void CordzInfo::SetCordRep(CordRep* rep) { + AssertHeld(); + rep_ = rep; +} + +inline void CordzInfo::UnsafeSetCordRep(CordRep* rep) { rep_ = rep; } + +inline CordRep* CordzInfo::RefCordRep() const ABSL_LOCKS_EXCLUDED(mutex_) { + MutexLock lock(&mutex_); + return rep_ ? 
CordRep::Ref(rep_) : nullptr; +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_INFO_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info_statistics_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_info_statistics_test.cc new file mode 100644 index 0000000000..9f2842d97d --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info_statistics_test.cc @@ -0,0 +1,508 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <iostream> +#include <random> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/strings/cord.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cord_rep_ring.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_sample_token.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_scope.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "absl/synchronization/notification.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// Do not print statistics contents, the matcher prints them as needed. 
+inline void PrintTo(const CordzStatistics& stats, std::ostream* s) { + if (s) *s << "CordzStatistics{...}"; +} + +namespace { + +// Creates a flat of the specified allocated size +CordRepFlat* Flat(size_t size) { + // Round up to a tag size, as we are going to poke an exact tag size back into + // the allocated flat. 'size returning allocators' could grant us more than we + // wanted, but we are ok to poke the 'requested' size in the tag, even in the + // presence of sized deletes, so we need to make sure the size rounds + // perfectly to a tag value. + assert(size >= kMinFlatSize); + size = RoundUpForTag(size); + CordRepFlat* flat = CordRepFlat::New(size - kFlatOverhead); + flat->tag = AllocatedSizeToTag(size); + flat->length = size - kFlatOverhead; + return flat; +} + +// Creates an external of the specified length +CordRepExternal* External(int length = 512) { + return static_cast<CordRepExternal*>( + NewExternalRep(absl::string_view("", length), [](absl::string_view) {})); +} + +// Creates a substring on the provided rep of length - 1 +CordRepSubstring* Substring(CordRep* rep) { + auto* substring = new CordRepSubstring; + substring->length = rep->length - 1; + substring->tag = SUBSTRING; + substring->child = rep; + return substring; +} + +// Creates a concat on the provided reps +CordRepConcat* Concat(CordRep* left, CordRep* right) { + auto* concat = new CordRepConcat; + concat->length = left->length + right->length; + concat->tag = CONCAT; + concat->left = left; + concat->right = right; + return concat; +} + +// Reference count helper +struct RefHelper { + std::vector<CordRep*> refs; + + ~RefHelper() { + for (CordRep* rep : refs) { + CordRep::Unref(rep); + } + } + + // Invokes CordRep::Unref() on `rep` when this instance is destroyed. + template <typename T> + T* NeedsUnref(T* rep) { + refs.push_back(rep); + return rep; + } + + // Adds `n` reference counts to `rep` which will be unreffed when this + // instance is destroyed. 
+ template <typename T> + T* Ref(T* rep, size_t n = 1) { + while (n--) { + NeedsUnref(CordRep::Ref(rep)); + } + return rep; + } +}; + +// Sizeof helper. Returns the allocated size of `p`, excluding any child +// elements for substring, concat and ring cord reps. +template <typename T> +size_t SizeOf(const T* rep) { + return sizeof(T); +} + +template <> +size_t SizeOf(const CordRepFlat* rep) { + return rep->AllocatedSize(); +} + +template <> +size_t SizeOf(const CordRepExternal* rep) { + // See cord.cc + return sizeof(CordRepExternalImpl<intptr_t>) + rep->length; +} + +template <> +size_t SizeOf(const CordRepRing* rep) { + return CordRepRing::AllocSize(rep->capacity()); +} + +// Computes fair share memory used in a naive 'we dare to recurse' way. +size_t FairShare(CordRep* rep, size_t ref = 1) { + size_t self = 0, children = 0; + ref *= rep->refcount.Get(); + if (rep->tag >= FLAT) { + self = SizeOf(rep->flat()); + } else if (rep->tag == EXTERNAL) { + self = SizeOf(rep->external()); + } else if (rep->tag == SUBSTRING) { + self = SizeOf(rep->substring()); + children = FairShare(rep->substring()->child, ref); + } else if (rep->tag == RING) { + self = SizeOf(rep->ring()); + rep->ring()->ForEach([&](CordRepRing::index_type i) { + self += FairShare(rep->ring()->entry_child(i)); + }); + } else if (rep->tag == CONCAT) { + self = SizeOf(rep->concat()); + children = FairShare(rep->concat()->left, ref) + + FairShare(rep->concat()->right, ref); + } else { + assert(false); + } + return self / ref + children; +} + +// Samples the cord and returns CordzInfo::GetStatistics() +CordzStatistics SampleCord(CordRep* rep) { + InlineData cord(rep); + CordzInfo::TrackCord(cord, CordzUpdateTracker::kUnknown); + CordzStatistics stats = cord.cordz_info()->GetCordzStatistics(); + cord.cordz_info()->Untrack(); + return stats; +} + +MATCHER_P(EqStatistics, stats, "Statistics equal expected values") { + bool ok = true; + +#define STATS_MATCHER_EXPECT_EQ(member) \ + if (stats.member != arg.member) 
{ \ + *result_listener << "\n stats." << #member \ + << ": actual = " << arg.member << ", expected " \ + << stats.member; \ + ok = false; \ + } + + STATS_MATCHER_EXPECT_EQ(size); + STATS_MATCHER_EXPECT_EQ(node_count); + STATS_MATCHER_EXPECT_EQ(node_counts.flat); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_64); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_128); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_256); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_512); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_1k); + STATS_MATCHER_EXPECT_EQ(node_counts.external); + STATS_MATCHER_EXPECT_EQ(node_counts.concat); + STATS_MATCHER_EXPECT_EQ(node_counts.substring); + STATS_MATCHER_EXPECT_EQ(node_counts.ring); + STATS_MATCHER_EXPECT_EQ(estimated_memory_usage); + STATS_MATCHER_EXPECT_EQ(estimated_fair_share_memory_usage); + +#undef STATS_MATCHER_EXPECT_EQ + + return ok; +} + +TEST(CordzInfoStatisticsTest, Flat) { + RefHelper ref; + auto* flat = ref.NeedsUnref(Flat(512)); + + CordzStatistics expected; + expected.size = flat->length; + expected.estimated_memory_usage = SizeOf(flat); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 1; + expected.node_counts.flat = 1; + expected.node_counts.flat_512 = 1; + + EXPECT_THAT(SampleCord(flat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedFlat) { + RefHelper ref; + auto* flat = ref.Ref(ref.NeedsUnref(Flat(64))); + + CordzStatistics expected; + expected.size = flat->length; + expected.estimated_memory_usage = SizeOf(flat); + expected.estimated_fair_share_memory_usage = SizeOf(flat) / 2; + expected.node_count = 1; + expected.node_counts.flat = 1; + expected.node_counts.flat_64 = 1; + + EXPECT_THAT(SampleCord(flat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, External) { + RefHelper ref; + auto* external = ref.NeedsUnref(External()); + + CordzStatistics expected; + expected.size = external->length; + expected.estimated_memory_usage = SizeOf(external); + 
expected.estimated_fair_share_memory_usage = SizeOf(external); + expected.node_count = 1; + expected.node_counts.external = 1; + + EXPECT_THAT(SampleCord(external), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedExternal) { + RefHelper ref; + auto* external = ref.Ref(ref.NeedsUnref(External())); + + CordzStatistics expected; + expected.size = external->length; + expected.estimated_memory_usage = SizeOf(external); + expected.estimated_fair_share_memory_usage = SizeOf(external) / 2; + expected.node_count = 1; + expected.node_counts.external = 1; + + EXPECT_THAT(SampleCord(external), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, Substring) { + RefHelper ref; + auto* flat = Flat(1024); + auto* substring = ref.NeedsUnref(Substring(flat)); + + CordzStatistics expected; + expected.size = substring->length; + expected.estimated_memory_usage = SizeOf(substring) + SizeOf(flat); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 2; + expected.node_counts.flat = 1; + expected.node_counts.flat_1k = 1; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(substring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedSubstring) { + RefHelper ref; + auto* flat = ref.Ref(Flat(511), 2); + auto* substring = ref.Ref(ref.NeedsUnref(Substring(flat))); + + CordzStatistics expected; + expected.size = substring->length; + expected.estimated_memory_usage = SizeOf(flat) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = + SizeOf(substring) / 2 + SizeOf(flat) / 6; + expected.node_count = 2; + expected.node_counts.flat = 1; + expected.node_counts.flat_512 = 1; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(substring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, Concat) { + RefHelper ref; + auto* flat1 = Flat(300); + auto* flat2 = Flat(2000); + auto* concat = ref.NeedsUnref(Concat(flat1, flat2)); + + CordzStatistics expected; + 
expected.size = concat->length; + expected.estimated_memory_usage = + SizeOf(concat) + SizeOf(flat1) + SizeOf(flat2); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 3; + expected.node_counts.flat = 2; + expected.node_counts.flat_512 = 1; + expected.node_counts.concat = 1; + + EXPECT_THAT(SampleCord(concat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, DeepConcat) { + RefHelper ref; + auto* flat1 = Flat(300); + auto* flat2 = Flat(2000); + auto* flat3 = Flat(400); + auto* external = External(3000); + auto* substring = Substring(external); + auto* concat1 = Concat(flat1, flat2); + auto* concat2 = Concat(flat3, substring); + auto* concat = ref.NeedsUnref(Concat(concat1, concat2)); + + CordzStatistics expected; + expected.size = concat->length; + expected.estimated_memory_usage = SizeOf(concat) * 3 + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + + expected.node_count = 8; + expected.node_counts.flat = 3; + expected.node_counts.flat_512 = 2; + expected.node_counts.external = 1; + expected.node_counts.concat = 3; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(concat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, DeepSharedConcat) { + RefHelper ref; + auto* flat1 = Flat(40); + auto* flat2 = ref.Ref(Flat(2000), 4); + auto* flat3 = Flat(70); + auto* external = ref.Ref(External(3000)); + auto* substring = ref.Ref(Substring(external), 3); + auto* concat1 = Concat(flat1, flat2); + auto* concat2 = Concat(flat3, substring); + auto* concat = ref.Ref(ref.NeedsUnref(Concat(concat1, concat2))); + + CordzStatistics expected; + expected.size = concat->length; + expected.estimated_memory_usage = SizeOf(concat) * 3 + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = 
FairShare(concat); + expected.node_count = 8; + expected.node_counts.flat = 3; + expected.node_counts.flat_64 = 1; + expected.node_counts.flat_128 = 1; + expected.node_counts.external = 1; + expected.node_counts.concat = 3; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(concat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, Ring) { + RefHelper ref; + auto* flat1 = Flat(240); + auto* flat2 = Flat(2000); + auto* flat3 = Flat(70); + auto* external = External(3000); + CordRepRing* ring = CordRepRing::Create(flat1); + ring = CordRepRing::Append(ring, flat2); + ring = CordRepRing::Append(ring, flat3); + ring = ref.NeedsUnref(CordRepRing::Append(ring, external)); + + CordzStatistics expected; + expected.size = ring->length; + expected.estimated_memory_usage = SizeOf(ring) + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 5; + expected.node_counts.flat = 3; + expected.node_counts.flat_128 = 1; + expected.node_counts.flat_256 = 1; + expected.node_counts.external = 1; + expected.node_counts.ring = 1; + + EXPECT_THAT(SampleCord(ring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedSubstringRing) { + RefHelper ref; + auto* flat1 = ref.Ref(Flat(240)); + auto* flat2 = Flat(200); + auto* flat3 = Flat(70); + auto* external = ref.Ref(External(3000), 5); + CordRepRing* ring = CordRepRing::Create(flat1); + ring = CordRepRing::Append(ring, flat2); + ring = CordRepRing::Append(ring, flat3); + ring = ref.Ref(CordRepRing::Append(ring, external), 4); + auto* substring = ref.Ref(ref.NeedsUnref(Substring(ring))); + + + CordzStatistics expected; + expected.size = substring->length; + expected.estimated_memory_usage = SizeOf(ring) + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = FairShare(substring); + expected.node_count = 6; + 
expected.node_counts.flat = 3; + expected.node_counts.flat_128 = 1; + expected.node_counts.flat_256 = 2; + expected.node_counts.external = 1; + expected.node_counts.ring = 1; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(substring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, ThreadSafety) { + Notification stop; + static constexpr int kNumThreads = 8; + int64_t sampled_node_count = 0; + + { + absl::synchronization_internal::ThreadPool pool(kNumThreads); + + // Run analyzer thread emulating a CordzHandler collection. + pool.Schedule([&]() { + while (!stop.HasBeenNotified()) { + // Run every 10us (about 100K total collections). + absl::SleepFor(absl::Microseconds(10)); + CordzSampleToken token; + for (const CordzInfo& cord_info : token) { + CordzStatistics stats = cord_info.GetCordzStatistics(); + sampled_node_count += stats.node_count; + } + } + }); + + // Run 'application threads' + for (int i = 0; i < kNumThreads; ++i) { + pool.Schedule([&]() { + // Track 0 - 2 cordz infos at a time, providing permutations of 0, 1 + // and 2 CordzHandle and CordzInfo queues being active, with plenty of + // 'empty to non empty' transitions. + InlineData cords[2]; + std::minstd_rand gen; + std::uniform_int_distribution<int> coin_toss(0, 1); + + while (!stop.HasBeenNotified()) { + for (InlineData& cord : cords) { + // 50/50 flip the state of the cord + if (coin_toss(gen) != 0) { + if (cord.is_tree()) { + // 50/50 simulate delete (untrack) or 'edit to empty' + if (coin_toss(gen) != 0) { + CordzInfo::MaybeUntrackCord(cord.cordz_info()); + } else { + CordzUpdateScope scope(cord.cordz_info(), + CordzUpdateTracker::kUnknown); + scope.SetCordRep(nullptr); + } + CordRep::Unref(cord.as_tree()); + cord.set_inline_size(0); + } else { + // 50/50 Ring or Flat coin toss + CordRep* rep = Flat(256); + rep = (coin_toss(gen) != 0) ? 
CordRepRing::Create(rep) : rep; + cord.make_tree(rep); + + // 50/50 sample + if (coin_toss(gen) != 0) { + CordzInfo::TrackCord(cord, CordzUpdateTracker::kUnknown); + } + } + } + } + } + for (InlineData& cord : cords) { + if (cord.is_tree()) { + CordzInfo::MaybeUntrackCord(cord.cordz_info()); + CordRep::Unref(cord.as_tree()); + } + } + }); + } + + // Run for 1 second to give memory and thread safety analyzers plenty of + // time to detect any mishaps or undefined behaviors. + absl::SleepFor(absl::Seconds(1)); + stop.Notify(); + } + + std::cout << "Sampled " << sampled_node_count << " nodes\n"; +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_info_test.cc new file mode 100644 index 0000000000..b98343ae79 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info_test.cc @@ -0,0 +1,341 @@ +// Copyright 2019 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "absl/strings/internal/cordz_info.h" + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::HasSubstr; +using ::testing::Ne; +using ::testing::SizeIs; + +// Used test values +auto constexpr kUnknownMethod = CordzUpdateTracker::kUnknown; +auto constexpr kTrackCordMethod = CordzUpdateTracker::kConstructorString; +auto constexpr kChildMethod = CordzUpdateTracker::kConstructorCord; +auto constexpr kUpdateMethod = CordzUpdateTracker::kAppendString; + +// Local less verbose helper +std::vector<const CordzHandle*> DeleteQueue() { + return CordzHandle::DiagnosticsGetDeleteQueue(); +} + +std::string FormatStack(absl::Span<void* const> raw_stack) { + static constexpr size_t buf_size = 1 << 14; + std::unique_ptr<char[]> buf(new char[buf_size]); + std::string output; + for (void* stackp : raw_stack) { + if (absl::Symbolize(stackp, buf.get(), buf_size)) { + absl::StrAppend(&output, " ", buf.get(), "\n"); + } + } + return output; +} + +TEST(CordzInfoTest, TrackCord) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + ASSERT_THAT(info, Ne(nullptr)); + EXPECT_FALSE(info->is_snapshot()); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(info)); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(data.rep.rep)); + info->Untrack(); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithoutSampling) { + 
CordzSamplingIntervalHelper sample_none(99999); + TestCordData parent, child; + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithSampling) { + CordzSamplingIntervalHelper sample_all(1); + TestCordData parent, child; + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithoutSamplingParentSampled) { + CordzSamplingIntervalHelper sample_none(99999); + TestCordData parent, child; + CordzInfo::TrackCord(parent.data, kTrackCordMethod); + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + CordzInfo* parent_info = parent.data.cordz_info(); + CordzInfo* child_info = child.data.cordz_info(); + ASSERT_THAT(child_info, Ne(nullptr)); + EXPECT_THAT(child_info->GetCordRepForTesting(), Eq(child.rep.rep)); + EXPECT_THAT(child_info->GetParentStack(), parent_info->GetStack()); + parent_info->Untrack(); + child_info->Untrack(); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithoutSamplingChildSampled) { + CordzSamplingIntervalHelper sample_none(99999); + TestCordData parent, child; + CordzInfo::TrackCord(child.data, kTrackCordMethod); + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithSamplingChildSampled) { + CordzSamplingIntervalHelper sample_all(1); + TestCordData parent, child; + CordzInfo::TrackCord(child.data, kTrackCordMethod); + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, UntrackCord) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + info->Untrack(); + EXPECT_THAT(DeleteQueue(), SizeIs(0)); +} + +TEST(CordzInfoTest, 
UntrackCordWithSnapshot) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + CordzSnapshot snapshot; + info->Untrack(); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(nullptr)); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(data.rep.rep)); + EXPECT_THAT(DeleteQueue(), ElementsAre(info, &snapshot)); +} + +TEST(CordzInfoTest, SetCordRep) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + TestCordRep rep; + info->Lock(CordzUpdateTracker::kAppendCord); + info->SetCordRep(rep.rep); + info->Unlock(); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(rep.rep)); + + info->Untrack(); +} + +TEST(CordzInfoTest, SetCordRepNullUntracksCordOnUnlock) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + info->Lock(CordzUpdateTracker::kAppendString); + info->SetCordRep(nullptr); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(nullptr)); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(info)); + + info->Unlock(); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(nullptr)); +} + +TEST(CordzInfoTest, RefCordRep) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + size_t refcount = data.rep.rep->refcount.Get(); + EXPECT_THAT(info->RefCordRep(), Eq(data.rep.rep)); + EXPECT_THAT(data.rep.rep->refcount.Get(), Eq(refcount + 1)); + CordRep::Unref(data.rep.rep); + info->Untrack(); +} + +#if GTEST_HAS_DEATH_TEST + +TEST(CordzInfoTest, SetCordRepRequiresMutex) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + TestCordRep rep; + EXPECT_DEBUG_DEATH(info->SetCordRep(rep.rep), ".*"); + info->Untrack(); +} + +#endif // GTEST_HAS_DEATH_TEST + +TEST(CordzInfoTest, TrackUntrackHeadFirstV2) { + CordzSnapshot snapshot; + 
EXPECT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); + + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info1 = data.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + TestCordData data2; + CordzInfo::TrackCord(data2.data, kTrackCordMethod); + CordzInfo* info2 = data2.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info2)); + EXPECT_THAT(info2->Next(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + info2->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + info1->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); +} + +TEST(CordzInfoTest, TrackUntrackTailFirstV2) { + CordzSnapshot snapshot; + EXPECT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); + + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info1 = data.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + TestCordData data2; + CordzInfo::TrackCord(data2.data, kTrackCordMethod); + CordzInfo* info2 = data2.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info2)); + EXPECT_THAT(info2->Next(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + info1->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info2)); + EXPECT_THAT(info2->Next(snapshot), Eq(nullptr)); + + info2->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); +} + +TEST(CordzInfoTest, StackV2) { + TestCordData data; + // kMaxStackDepth is intentionally less than 64 (which is the max depth that + // Cordz will record) because if the actual stack depth is over 64 + // (which it is on Apple platforms) then the expected_stack will end up + // catching a few frames at the end that the actual_stack didn't get and + // it will no longer be subset. 
At the time of this writing 58 is the max + // that will allow this test to pass (with a minimum os version of iOS 9), so + // rounded down to 50 to hopefully not run into this in the future if Apple + // makes small modifications to its testing stack. 50 is sufficient to prove + // that we got a decent stack. + static constexpr int kMaxStackDepth = 50; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + std::vector<void*> local_stack; + local_stack.resize(kMaxStackDepth); + // In some environments we don't get stack traces. For example in Android + // absl::GetStackTrace will return 0 indicating it didn't find any stack. The + // resultant formatted stack will be "", but that still equals the stack + // recorded in CordzInfo, which is also empty. The skip_count is 1 so that the + // line number of the current stack isn't included in the HasSubstr check. + local_stack.resize(absl::GetStackTrace(local_stack.data(), kMaxStackDepth, + /*skip_count=*/1)); + + std::string got_stack = FormatStack(info->GetStack()); + std::string expected_stack = FormatStack(local_stack); + // If TrackCord is inlined, got_stack should match expected_stack. If it isn't + // inlined, got_stack should include an additional frame not present in + // expected_stack. Either way, expected_stack should be a substring of + // got_stack. + EXPECT_THAT(got_stack, HasSubstr(expected_stack)); + + info->Untrack(); +} + +// Local helper functions to get different stacks for child and parent. 
+CordzInfo* TrackChildCord(InlineData& data, const InlineData& parent) { + CordzInfo::TrackCord(data, parent, kChildMethod); + return data.cordz_info(); +} +CordzInfo* TrackParentCord(InlineData& data) { + CordzInfo::TrackCord(data, kTrackCordMethod); + return data.cordz_info(); +} + +TEST(CordzInfoTest, GetStatistics) { + TestCordData data; + CordzInfo* info = TrackParentCord(data.data); + + CordzStatistics statistics = info->GetCordzStatistics(); + EXPECT_THAT(statistics.size, Eq(data.rep.rep->length)); + EXPECT_THAT(statistics.method, Eq(kTrackCordMethod)); + EXPECT_THAT(statistics.parent_method, Eq(kUnknownMethod)); + EXPECT_THAT(statistics.update_tracker.Value(kTrackCordMethod), Eq(1)); + + info->Untrack(); +} + +TEST(CordzInfoTest, LockCountsMethod) { + TestCordData data; + CordzInfo* info = TrackParentCord(data.data); + + info->Lock(kUpdateMethod); + info->Unlock(); + info->Lock(kUpdateMethod); + info->Unlock(); + + CordzStatistics statistics = info->GetCordzStatistics(); + EXPECT_THAT(statistics.update_tracker.Value(kUpdateMethod), Eq(2)); + + info->Untrack(); +} + +TEST(CordzInfoTest, FromParent) { + TestCordData parent; + TestCordData child; + CordzInfo* info_parent = TrackParentCord(parent.data); + CordzInfo* info_child = TrackChildCord(child.data, parent.data); + + std::string stack = FormatStack(info_parent->GetStack()); + std::string parent_stack = FormatStack(info_child->GetParentStack()); + EXPECT_THAT(stack, Eq(parent_stack)); + + CordzStatistics statistics = info_child->GetCordzStatistics(); + EXPECT_THAT(statistics.size, Eq(child.rep.rep->length)); + EXPECT_THAT(statistics.method, Eq(kChildMethod)); + EXPECT_THAT(statistics.parent_method, Eq(kTrackCordMethod)); + EXPECT_THAT(statistics.update_tracker.Value(kChildMethod), Eq(1)); + + info_parent->Untrack(); + info_child->Untrack(); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git 
a/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.cc new file mode 100644 index 0000000000..ba1270d8f0 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.cc @@ -0,0 +1,64 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_sample_token.h" + +#include "absl/base/config.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_info.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +CordzSampleToken::Iterator& CordzSampleToken::Iterator::operator++() { + if (current_) { + current_ = current_->Next(*token_); + } + return *this; +} + +CordzSampleToken::Iterator CordzSampleToken::Iterator::operator++(int) { + Iterator it(*this); + operator++(); + return it; +} + +bool operator==(const CordzSampleToken::Iterator& lhs, + const CordzSampleToken::Iterator& rhs) { + return lhs.current_ == rhs.current_ && + (lhs.current_ == nullptr || lhs.token_ == rhs.token_); +} + +bool operator!=(const CordzSampleToken::Iterator& lhs, + const CordzSampleToken::Iterator& rhs) { + return !(lhs == rhs); +} + +CordzSampleToken::Iterator::reference CordzSampleToken::Iterator::operator*() + const { + return *current_; +} + +CordzSampleToken::Iterator::pointer CordzSampleToken::Iterator::operator->() + const { + return current_; +} + 
+CordzSampleToken::Iterator::Iterator(const CordzSampleToken* token) + : token_(token), current_(CordzInfo::Head(*token)) {} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.h b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.h new file mode 100644 index 0000000000..28a1d70ccc --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.h @@ -0,0 +1,97 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/base/config.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_info.h" + +#ifndef ABSL_STRINGS_CORDZ_SAMPLE_TOKEN_H_ +#define ABSL_STRINGS_CORDZ_SAMPLE_TOKEN_H_ + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// The existence of a CordzSampleToken guarantees that a reader can traverse the +// global_cordz_infos_head linked-list without needing to hold a mutex. When a +// CordzSampleToken exists, all CordzInfo objects that would be destroyed are +// instead appended to a deletion queue. When the CordzSampleToken is destroyed, +// it will also clean up any of these CordzInfo objects. +// +// E.g., ST are CordzSampleToken objects and CH are CordzHandle objects. 
+// ST1 <- CH1 <- CH2 <- ST2 <- CH3 <- global_delete_queue_tail
+//
+// This list tracks that CH1 and CH2 were created after ST1, so the thread
+// holding ST1 might have a reference to CH1, CH2, ST2, and CH3. However, ST2
+// was created later, so the thread holding the ST2 token cannot have a
+// reference to ST1, CH1, or CH2. If ST1 is cleaned up first, that thread will
+// delete ST1, CH1, and CH2. If instead ST2 is cleaned up first, that thread
+// will only delete ST2.
+//
+// If ST1 is cleaned up first, the new list will be:
+// ST2 <- CH3 <- global_delete_queue_tail
+//
+// If ST2 is cleaned up first, the new list will be:
+// ST1 <- CH1 <- CH2 <- CH3 <- global_delete_queue_tail
+//
+// All new CordzHandle objects are appended to the list, so if a new thread
+// comes along before either ST1 or ST2 is cleaned up, the new list will be:
+// ST1 <- CH1 <- CH2 <- ST2 <- CH3 <- ST3 <- global_delete_queue_tail
+//
+// A thread must hold the global_delete_queue_mu mutex whenever it's altering
+// this list.
+//
+// It is safe for a thread that holds a CordzSampleToken to read
+// global_cordz_infos at any time since the objects it is able to retrieve will
+// not be deleted while the CordzSampleToken exists.
+class CordzSampleToken : public CordzSnapshot {
+ public:
+  // Single-pass input iterator over the CordzInfo instances visible to the
+  // owning token. Equality is defined by the free operator== in the .cc:
+  // two end() iterators compare equal; non-end() iterators compare equal
+  // only when they reference the same CordzInfo through the same token.
+  class Iterator {
+   public:
+    using iterator_category = std::input_iterator_tag;
+    using value_type = const CordzInfo&;
+    using difference_type = ptrdiff_t;
+    using pointer = const CordzInfo*;
+    using reference = value_type;
+
+    // A default-constructed Iterator is an end() iterator.
+    Iterator() = default;
+
+    Iterator& operator++();
+    Iterator operator++(int);
+    friend bool operator==(const Iterator& lhs, const Iterator& rhs);
+    friend bool operator!=(const Iterator& lhs, const Iterator& rhs);
+    reference operator*() const;
+    pointer operator->() const;
+
+   private:
+    friend class CordzSampleToken;
+    // Positions the iterator at the head of `token`'s CordzInfo list.
+    explicit Iterator(const CordzSampleToken* token);
+
+    // Token guarding the traversal; nullptr for end() iterators.
+    const CordzSampleToken* token_ = nullptr;
+    // CordzInfo currently referenced; nullptr for end() iterators.
+    pointer current_ = nullptr;
+  };
+
+  CordzSampleToken() = default;
+  // Not copyable or assignable: a token's identity is tied to its position
+  // in the global deletion queue (see file comment above).
+  CordzSampleToken(const CordzSampleToken&) = delete;
+  CordzSampleToken& operator=(const CordzSampleToken&) = delete;
+
+  // Iterates the CordzInfo instances that are visible to this token.
+  Iterator begin() { return Iterator(this); }
+  Iterator end() { return Iterator(); }
+};
+
+}  // namespace cord_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_STRINGS_CORDZ_SAMPLE_TOKEN_H_
diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token_test.cc
new file mode 100644
index 0000000000..9f54301d68
--- /dev/null
+++ b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token_test.cc
@@ -0,0 +1,208 @@
+// Copyright 2019 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_sample_token.h" + +#include <memory> +#include <type_traits> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "absl/random/random.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Ne; + +// Used test values +auto constexpr kTrackCordMethod = CordzUpdateTracker::kConstructorString; + +TEST(CordzSampleTokenTest, IteratorTraits) { + static_assert(std::is_copy_constructible<CordzSampleToken::Iterator>::value, + ""); + static_assert(std::is_copy_assignable<CordzSampleToken::Iterator>::value, ""); + static_assert(std::is_move_constructible<CordzSampleToken::Iterator>::value, + ""); + static_assert(std::is_move_assignable<CordzSampleToken::Iterator>::value, ""); + static_assert( + std::is_same< + std::iterator_traits<CordzSampleToken::Iterator>::iterator_category, + std::input_iterator_tag>::value, + ""); + static_assert( + std::is_same<std::iterator_traits<CordzSampleToken::Iterator>::value_type, + const CordzInfo&>::value, + ""); + static_assert( + std::is_same< + std::iterator_traits<CordzSampleToken::Iterator>::difference_type, + ptrdiff_t>::value, + ""); + static_assert( + std::is_same<std::iterator_traits<CordzSampleToken::Iterator>::pointer, + const CordzInfo*>::value, + ""); + static_assert( + std::is_same<std::iterator_traits<CordzSampleToken::Iterator>::reference, + const CordzInfo&>::value, + 
""); +} + +TEST(CordzSampleTokenTest, IteratorEmpty) { + CordzSampleToken token; + EXPECT_THAT(token.begin(), Eq(token.end())); +} + +TEST(CordzSampleTokenTest, Iterator) { + TestCordData cord1, cord2, cord3; + CordzInfo::TrackCord(cord1.data, kTrackCordMethod); + CordzInfo* info1 = cord1.data.cordz_info(); + CordzInfo::TrackCord(cord2.data, kTrackCordMethod); + CordzInfo* info2 = cord2.data.cordz_info(); + CordzInfo::TrackCord(cord3.data, kTrackCordMethod); + CordzInfo* info3 = cord3.data.cordz_info(); + + CordzSampleToken token; + std::vector<const CordzInfo*> found; + for (const CordzInfo& cord_info : token) { + found.push_back(&cord_info); + } + + EXPECT_THAT(found, ElementsAre(info3, info2, info1)); + + info1->Untrack(); + info2->Untrack(); + info3->Untrack(); +} + +TEST(CordzSampleTokenTest, IteratorEquality) { + TestCordData cord1; + TestCordData cord2; + TestCordData cord3; + CordzInfo::TrackCord(cord1.data, kTrackCordMethod); + CordzInfo* info1 = cord1.data.cordz_info(); + + CordzSampleToken token1; + // lhs starts with the CordzInfo corresponding to cord1 at the head. + CordzSampleToken::Iterator lhs = token1.begin(); + + CordzInfo::TrackCord(cord2.data, kTrackCordMethod); + CordzInfo* info2 = cord2.data.cordz_info(); + + CordzSampleToken token2; + // rhs starts with the CordzInfo corresponding to cord2 at the head. + CordzSampleToken::Iterator rhs = token2.begin(); + + CordzInfo::TrackCord(cord3.data, kTrackCordMethod); + CordzInfo* info3 = cord3.data.cordz_info(); + + // lhs is on cord1 while rhs is on cord2. + EXPECT_THAT(lhs, Ne(rhs)); + + rhs++; + // lhs and rhs are both on cord1, but they didn't come from the same + // CordzSampleToken. + EXPECT_THAT(lhs, Ne(rhs)); + + lhs++; + rhs++; + // Both lhs and rhs are done, so they are on nullptr. 
+ EXPECT_THAT(lhs, Eq(rhs)); + + info1->Untrack(); + info2->Untrack(); + info3->Untrack(); +} + +TEST(CordzSampleTokenTest, MultiThreaded) { + Notification stop; + static constexpr int kNumThreads = 4; + static constexpr int kNumCords = 3; + static constexpr int kNumTokens = 3; + absl::synchronization_internal::ThreadPool pool(kNumThreads); + + for (int i = 0; i < kNumThreads; ++i) { + pool.Schedule([&stop]() { + absl::BitGen gen; + TestCordData cords[kNumCords]; + std::unique_ptr<CordzSampleToken> tokens[kNumTokens]; + + while (!stop.HasBeenNotified()) { + // Randomly perform one of five actions: + // 1) Untrack + // 2) Track + // 3) Iterate over Cords visible to a token. + // 4) Unsample + // 5) Sample + int index = absl::Uniform(gen, 0, kNumCords); + if (absl::Bernoulli(gen, 0.5)) { + TestCordData& cord = cords[index]; + // Track/untrack. + if (cord.data.is_profiled()) { + // 1) Untrack + cord.data.cordz_info()->Untrack(); + cord.data.clear_cordz_info();; + } else { + // 2) Track + CordzInfo::TrackCord(cord.data, kTrackCordMethod); + } + } else { + std::unique_ptr<CordzSampleToken>& token = tokens[index]; + if (token) { + if (absl::Bernoulli(gen, 0.5)) { + // 3) Iterate over Cords visible to a token. + for (const CordzInfo& info : *token) { + // This is trivial work to allow us to compile the loop. + EXPECT_THAT(info.Next(*token), Ne(&info)); + } + } else { + // 4) Unsample + token = nullptr; + } + } else { + // 5) Sample + token = absl::make_unique<CordzSampleToken>(); + } + } + } + for (TestCordData& cord : cords) { + CordzInfo::MaybeUntrackCord(cord.data.cordz_info()); + } + }); + } + // The threads will hammer away. Give it a little bit of time for tsan to + // spot errors. 
+ absl::SleepFor(absl::Seconds(3)); + stop.Notify(); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_statistics.h b/third_party/abseil-cpp/absl/strings/internal/cordz_statistics.h new file mode 100644 index 0000000000..e03c651e9c --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_statistics.h @@ -0,0 +1,84 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_INTERNAL_CORDZ_STATISTICS_H_ +#define ABSL_STRINGS_INTERNAL_CORDZ_STATISTICS_H_ + +#include <cstdint> + +#include "absl/base/config.h" +#include "absl/strings/internal/cordz_update_tracker.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzStatistics captures some meta information about a Cord's shape. 
+struct CordzStatistics { + using MethodIdentifier = CordzUpdateTracker::MethodIdentifier; + + // Node counts information + struct NodeCounts { + size_t flat = 0; // #flats + size_t flat_64 = 0; // #flats up to 64 bytes + size_t flat_128 = 0; // #flats up to 128 bytes + size_t flat_256 = 0; // #flats up to 256 bytes + size_t flat_512 = 0; // #flats up to 512 bytes + size_t flat_1k = 0; // #flats up to 1K bytes + size_t external = 0; // #external reps + size_t substring = 0; // #substring reps + size_t concat = 0; // #concat reps + size_t ring = 0; // #ring buffer reps + }; + + // The size of the cord in bytes. This matches the result of Cord::size(). + int64_t size = 0; + + // The estimated memory used by the sampled cord. This value matches the + // value as reported by Cord::EstimatedMemoryUsage(). + // A value of 0 implies the property has not been recorded. + int64_t estimated_memory_usage = 0; + + // The effective memory used by the sampled cord, inversely weighted by the + // effective indegree of each allocated node. This is a representation of the + // fair share of memory usage that should be attributed to the sampled cord. + // This value is more useful for cases where one or more nodes are referenced + // by multiple Cord instances, and for cases where a Cord includes the same + // node multiple times (either directly or indirectly). + // A value of 0 implies the property has not been recorded. + int64_t estimated_fair_share_memory_usage = 0; + + // The total number of nodes referenced by this cord. + // For ring buffer Cords, this includes the 'ring buffer' node. + // A value of 0 implies the property has not been recorded. + int64_t node_count = 0; + + // Detailed node counts per type + NodeCounts node_counts; + + // The cord method responsible for sampling the cord. + MethodIdentifier method = MethodIdentifier::kUnknown; + + // The cord method responsible for sampling the parent cord if applicable. 
+ MethodIdentifier parent_method = MethodIdentifier::kUnknown; + + // Update tracker tracking invocation count per cord method. + CordzUpdateTracker update_tracker; +}; + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_INTERNAL_CORDZ_STATISTICS_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope.h b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope.h new file mode 100644 index 0000000000..57ba75de93 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope.h @@ -0,0 +1,71 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_SCOPE_H_ +#define ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_SCOPE_H_ + +#include "absl/base/config.h" +#include "absl/base/optimization.h" +#include "absl/base/thread_annotations.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_update_tracker.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzUpdateScope scopes an update to the provided CordzInfo. +// The class invokes `info->Lock(method)` and `info->Unlock()` to guard +// cordrep updates. This class does nothing if `info` is null. +// See also the 'Lock`, `Unlock` and `SetCordRep` methods in `CordzInfo`. 
+class ABSL_SCOPED_LOCKABLE CordzUpdateScope {
+ public:
+  // Acquires `info`'s update lock for `method`; no-op when `info` is null.
+  // ABSL_PREDICT_FALSE: sampled (non-null info) cords are expected to be
+  // the rare case — TODO confirm against the sampling rate.
+  CordzUpdateScope(CordzInfo* info, CordzUpdateTracker::MethodIdentifier method)
+      ABSL_EXCLUSIVE_LOCK_FUNCTION(info)
+      : info_(info) {
+    if (ABSL_PREDICT_FALSE(info_)) {
+      info->Lock(method);
+    }
+  }
+
+  // CordzUpdateScope can not be copied or assigned to.
+  CordzUpdateScope(CordzUpdateScope&& rhs) = delete;
+  CordzUpdateScope(const CordzUpdateScope&) = delete;
+  CordzUpdateScope& operator=(CordzUpdateScope&& rhs) = delete;
+  CordzUpdateScope& operator=(const CordzUpdateScope&) = delete;
+
+  // Releases the lock acquired by the constructor, if any.
+  ~CordzUpdateScope() ABSL_UNLOCK_FUNCTION() {
+    if (ABSL_PREDICT_FALSE(info_)) {
+      info_->Unlock();
+    }
+  }
+
+  // Updates the cordrep of the guarded CordzInfo; no-op when not sampled.
+  void SetCordRep(CordRep* rep) const {
+    if (ABSL_PREDICT_FALSE(info_)) {
+      info_->SetCordRep(rep);
+    }
+  }
+
+  // Returns the (possibly null) CordzInfo guarded by this scope.
+  CordzInfo* info() const { return info_; }
+
+ private:
+  CordzInfo* info_;
+};
+
+}  // namespace cord_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_SCOPE_H_
diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope_test.cc
new file mode 100644
index 0000000000..3d08c622d0
--- /dev/null
+++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope_test.cc
@@ -0,0 +1,49 @@
+// Copyright 2021 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "absl/strings/internal/cordz_update_scope.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_update_tracker.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +namespace { + +// Used test values +auto constexpr kTrackCordMethod = CordzUpdateTracker::kConstructorString; + +TEST(CordzUpdateScopeTest, ScopeNullptr) { + CordzUpdateScope scope(nullptr, kTrackCordMethod); +} + +TEST(CordzUpdateScopeTest, ScopeSampledCord) { + TestCordData cord; + CordzInfo::TrackCord(cord.data, kTrackCordMethod); + CordzUpdateScope scope(cord.data.cordz_info(), kTrackCordMethod); + cord.data.cordz_info()->SetCordRep(nullptr); +} + +} // namespace +ABSL_NAMESPACE_END +} // namespace cord_internal + +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker.h b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker.h new file mode 100644 index 0000000000..02efcc3a2d --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker.h @@ -0,0 +1,119 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_TRACKER_H_ +#define ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_TRACKER_H_ + +#include <atomic> +#include <cstdint> + +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzUpdateTracker tracks counters for Cord update methods. +// +// The purpose of CordzUpdateTracker is to track the number of calls to methods +// updating Cord data for sampled cords. The class internally uses 'lossy' +// atomic operations: Cord is thread-compatible, so there is no need to +// synchronize updates. However, Cordz collection threads may call 'Value()' at +// any point, so the class needs to provide thread safe access. +// +// This class is thread-safe. But as per above comments, all non-const methods +// should be used single-threaded only: updates are thread-safe but lossy. +class CordzUpdateTracker { + public: + // Tracked update methods. + enum MethodIdentifier { + kUnknown, + kAppendCord, + kAppendExternalMemory, + kAppendString, + kAssignCord, + kAssignString, + kClear, + kConstructorCord, + kConstructorString, + kCordReader, + kFlatten, + kGetAppendRegion, + kMakeCordFromExternal, + kMoveAppendCord, + kMoveAssignCord, + kMovePrependCord, + kPrependCord, + kPrependString, + kRemovePrefix, + kRemoveSuffix, + kSubCord, + + // kNumMethods defines the number of entries: must be the last entry. + kNumMethods, + }; + + // Constructs a new instance. All counters are zero-initialized. + constexpr CordzUpdateTracker() noexcept : values_{} {} + + // Copy constructs a new instance. + CordzUpdateTracker(const CordzUpdateTracker& rhs) noexcept { *this = rhs; } + + // Assigns the provided value to this instance. + CordzUpdateTracker& operator=(const CordzUpdateTracker& rhs) noexcept { + for (int i = 0; i < kNumMethods; ++i) { + values_[i].store(rhs.values_[i].load(std::memory_order_relaxed), + std::memory_order_relaxed); + } + return *this; + } + + // Returns the value for the specified method. 
+ int64_t Value(MethodIdentifier method) const { + return values_[method].load(std::memory_order_relaxed); + } + + // Increases the value for the specified method by `n` + void LossyAdd(MethodIdentifier method, int64_t n = 1) { + auto& value = values_[method]; + value.store(value.load(std::memory_order_relaxed) + n, + std::memory_order_relaxed); + } + + // Adds all the values from `src` to this instance + void LossyAdd(const CordzUpdateTracker& src) { + for (int i = 0; i < kNumMethods; ++i) { + MethodIdentifier method = static_cast<MethodIdentifier>(i); + if (int64_t value = src.Value(method)) { + LossyAdd(method, value); + } + } + } + + private: + // Until C++20 std::atomic is not constexpr default-constructible, so we need + // a wrapper for this class to be constexpr constructible. + class Counter : public std::atomic<int64_t> { + public: + constexpr Counter() noexcept : std::atomic<int64_t>(0) {} + }; + + Counter values_[kNumMethods]; +}; + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_TRACKER_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker_test.cc new file mode 100644 index 0000000000..fcd17df7a0 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker_test.cc @@ -0,0 +1,143 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_update_tracker.h" + +#include <array> +#include <thread> // NOLINT + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/attributes.h" +#include "absl/base/config.h" +#include "absl/synchronization/notification.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::AnyOf; +using ::testing::Eq; + +using Method = CordzUpdateTracker::MethodIdentifier; +using Methods = std::array<Method, Method::kNumMethods>; + +// Returns an array of all methods defined in `MethodIdentifier` +Methods AllMethods() { + return Methods{Method::kUnknown, + Method::kAppendCord, + Method::kAppendExternalMemory, + Method::kAppendString, + Method::kAssignCord, + Method::kAssignString, + Method::kClear, + Method::kConstructorCord, + Method::kConstructorString, + Method::kCordReader, + Method::kFlatten, + Method::kGetAppendRegion, + Method::kMakeCordFromExternal, + Method::kMoveAppendCord, + Method::kMoveAssignCord, + Method::kMovePrependCord, + Method::kPrependCord, + Method::kPrependString, + Method::kRemovePrefix, + Method::kRemoveSuffix, + Method::kSubCord}; +} + +TEST(CordzUpdateTracker, IsConstExprAndInitializesToZero) { + constexpr CordzUpdateTracker tracker; + for (Method method : AllMethods()) { + ASSERT_THAT(tracker.Value(method), Eq(0)); + } +} + +TEST(CordzUpdateTracker, LossyAdd) { + int64_t n = 1; + CordzUpdateTracker tracker; + for (Method method : AllMethods()) { + tracker.LossyAdd(method, n); + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } +} + +TEST(CordzUpdateTracker, CopyConstructor) { + int64_t n = 1; + CordzUpdateTracker src; + for (Method method : AllMethods()) { + src.LossyAdd(method, n); + n += 2; + } + + n = 1; + CordzUpdateTracker tracker(src); + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } +} 
+ +TEST(CordzUpdateTracker, OperatorAssign) { + int64_t n = 1; + CordzUpdateTracker src; + CordzUpdateTracker tracker; + for (Method method : AllMethods()) { + src.LossyAdd(method, n); + n += 2; + } + + n = 1; + tracker = src; + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } +} + +TEST(CordzUpdateTracker, ThreadSanitizedValueCheck) { + absl::Notification done; + CordzUpdateTracker tracker; + + std::thread reader([&done, &tracker] { + while (!done.HasBeenNotified()) { + int n = 1; + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), AnyOf(Eq(n), Eq(0))); + n += 2; + } + } + int n = 1; + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } + }); + + int64_t n = 1; + for (Method method : AllMethods()) { + tracker.LossyAdd(method, n); + n += 2; + } + done.Notify(); + reader.join(); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h index e42628e394..749c66e78e 100644 --- a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h +++ b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h @@ -17,6 +17,7 @@ #ifndef ABSL_STRINGS_INTERNAL_RESIZE_UNINITIALIZED_H_ #define ABSL_STRINGS_INTERNAL_RESIZE_UNINITIALIZED_H_ +#include <algorithm> #include <string> #include <type_traits> #include <utility> @@ -66,6 +67,28 @@ inline void STLStringResizeUninitialized(string_type* s, size_t new_size) { ResizeUninitializedTraits<string_type>::Resize(s, new_size); } +// Used to ensure exponential growth so that the amortized complexity of +// increasing the string size by a small amount is O(1), in contrast to +// O(str->size()) in the case of precise growth. 
+template <typename string_type> +void STLStringReserveAmortized(string_type* s, size_t new_size) { + const size_t cap = s->capacity(); + if (new_size > cap) { + // Make sure to always grow by at least a factor of 2x. + s->reserve((std::max)(new_size, 2 * cap)); + } +} + +// Like STLStringResizeUninitialized(str, new_size), except guaranteed to use +// exponential growth so that the amortized complexity of increasing the string +// size by a small amount is O(1), in contrast to O(str->size()) in the case of +// precise growth. +template <typename string_type> +void STLStringResizeUninitializedAmortized(string_type* s, size_t new_size) { + STLStringReserveAmortized(s, new_size); + STLStringResizeUninitialized(s, new_size); +} + } // namespace strings_internal ABSL_NAMESPACE_END } // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc index 0f8b3c2a95..01ee476b6c 100644 --- a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc +++ b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc @@ -24,11 +24,13 @@ int resize_call_count = 0; // resize() method has been called. struct resizable_string { size_t size() const { return 0; } + size_t capacity() const { return 0; } char& operator[](size_t) { static char c = '\0'; return c; } void resize(size_t) { resize_call_count += 1; } + void reserve(size_t) {} }; int resize_default_init_call_count = 0; @@ -37,12 +39,14 @@ int resize_default_init_call_count = 0; // resize() and __resize_default_init() methods have been called. 
struct resize_default_init_string { size_t size() const { return 0; } + size_t capacity() const { return 0; } char& operator[](size_t) { static char c = '\0'; return c; } void resize(size_t) { resize_call_count += 1; } void __resize_default_init(size_t) { resize_default_init_call_count += 1; } + void reserve(size_t) {} }; TEST(ResizeUninit, WithAndWithout) { @@ -60,6 +64,9 @@ TEST(ResizeUninit, WithAndWithout) { absl::strings_internal::STLStringResizeUninitialized(&rs, 237); EXPECT_EQ(resize_call_count, 1); EXPECT_EQ(resize_default_init_call_count, 0); + absl::strings_internal::STLStringResizeUninitializedAmortized(&rs, 1000); + EXPECT_EQ(resize_call_count, 2); + EXPECT_EQ(resize_default_init_call_count, 0); } resize_call_count = 0; @@ -76,7 +83,23 @@ TEST(ResizeUninit, WithAndWithout) { absl::strings_internal::STLStringResizeUninitialized(&rus, 237); EXPECT_EQ(resize_call_count, 0); EXPECT_EQ(resize_default_init_call_count, 1); + absl::strings_internal::STLStringResizeUninitializedAmortized(&rus, 1000); + EXPECT_EQ(resize_call_count, 0); + EXPECT_EQ(resize_default_init_call_count, 2); + } +} + +TEST(ResizeUninit, Amortized) { + std::string str; + size_t prev_cap = str.capacity(); + int cap_increase_count = 0; + for (int i = 0; i < 1000; ++i) { + absl::strings_internal::STLStringResizeUninitializedAmortized(&str, i); + size_t new_cap = str.capacity(); + if (new_cap > prev_cap) ++cap_increase_count; + prev_cap = new_cap; } + EXPECT_LT(cap_increase_count, 50); } } // namespace diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h b/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h index 7040c86677..3c91be701f 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h @@ -122,6 +122,14 @@ StringConvertResult FormatConvertImpl(const std::string& v, StringConvertResult FormatConvertImpl(string_view v, FormatConversionSpecImpl conv, FormatSinkImpl* sink); 
+#if defined(ABSL_HAVE_STD_STRING_VIEW) && !defined(ABSL_USES_STD_STRING_VIEW) +inline StringConvertResult FormatConvertImpl(std::string_view v, + FormatConversionSpecImpl conv, + FormatSinkImpl* sink) { + return FormatConvertImpl(absl::string_view(v.data(), v.size()), conv, sink); +} +#endif // ABSL_HAVE_STD_STRING_VIEW && !ABSL_USES_STD_STRING_VIEW + ArgConvertResult<FormatConversionCharSetUnion( FormatConversionCharSetInternal::s, FormatConversionCharSetInternal::p)> FormatConvertImpl(const char* v, const FormatConversionSpecImpl conv, diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc index 4e68b90b5c..c988ba8fd2 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc @@ -58,7 +58,7 @@ inline bool ArgContext::Bind(const UnboundConversion* unbound, if (static_cast<size_t>(arg_position - 1) >= pack_.size()) return false; arg = &pack_[arg_position - 1]; // 1-based - if (!unbound->flags.basic) { + if (unbound->flags != Flags::kBasic) { int width = unbound->width.value(); bool force_left = false; if (unbound->width.is_from_arg()) { @@ -84,9 +84,8 @@ inline bool ArgContext::Bind(const UnboundConversion* unbound, FormatConversionSpecImplFriend::SetPrecision(precision, bound); if (force_left) { - Flags flags = unbound->flags; - flags.left = true; - FormatConversionSpecImplFriend::SetFlags(flags, bound); + FormatConversionSpecImplFriend::SetFlags(unbound->flags | Flags::kLeft, + bound); } else { FormatConversionSpecImplFriend::SetFlags(unbound->flags, bound); } diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc index 926283cfac..91e0360901 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc +++ 
b/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc @@ -229,6 +229,9 @@ TEST_F(FormatConvertTest, BasicString) { TestStringConvert(static_cast<const char*>("hello")); TestStringConvert(std::string("hello")); TestStringConvert(string_view("hello")); +#if defined(ABSL_HAVE_STD_STRING_VIEW) + TestStringConvert(std::string_view("hello")); +#endif // ABSL_HAVE_STD_STRING_VIEW } TEST_F(FormatConvertTest, NullString) { diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc index bb0d96cf32..484f6ebfc1 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc @@ -23,13 +23,13 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace str_format_internal { -std::string Flags::ToString() const { +std::string FlagsToString(Flags v) { std::string s; - s.append(left ? "-" : ""); - s.append(show_pos ? "+" : ""); - s.append(sign_col ? " " : ""); - s.append(alt ? "#" : ""); - s.append(zero ? "0" : ""); + s.append(FlagsContains(v, Flags::kLeft) ? "-" : ""); + s.append(FlagsContains(v, Flags::kShowPos) ? "+" : ""); + s.append(FlagsContains(v, Flags::kSignCol) ? " " : ""); + s.append(FlagsContains(v, Flags::kAlt) ? "#" : ""); + s.append(FlagsContains(v, Flags::kZero) ? 
"0" : ""); return s; } diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h index a9b9e137de..55cbb56d0a 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h @@ -128,19 +128,33 @@ class FormatSinkImpl { char buf_[1024]; }; -struct Flags { - bool basic : 1; // fastest conversion: no flags, width, or precision - bool left : 1; // "-" - bool show_pos : 1; // "+" - bool sign_col : 1; // " " - bool alt : 1; // "#" - bool zero : 1; // "0" - std::string ToString() const; - friend std::ostream& operator<<(std::ostream& os, const Flags& v) { - return os << v.ToString(); - } +enum class Flags : uint8_t { + kBasic = 0, + kLeft = 1 << 0, + kShowPos = 1 << 1, + kSignCol = 1 << 2, + kAlt = 1 << 3, + kZero = 1 << 4, + // This is not a real flag. It just exists to turn off kBasic when no other + // flags are set. This is for when width/precision are specified. + kNonBasic = 1 << 5, }; +constexpr Flags operator|(Flags a, Flags b) { + return static_cast<Flags>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b)); +} + +constexpr bool FlagsContains(Flags haystack, Flags needle) { + return (static_cast<uint8_t>(haystack) & static_cast<uint8_t>(needle)) == + static_cast<uint8_t>(needle); +} + +std::string FlagsToString(Flags v); + +inline std::ostream& operator<<(std::ostream& os, Flags v) { + return os << FlagsToString(v); +} + // clang-format off #define ABSL_INTERNAL_CONVERSION_CHARS_EXPAND_(X_VAL, X_SEP) \ /* text */ \ @@ -257,12 +271,16 @@ struct FormatConversionSpecImplFriend; class FormatConversionSpecImpl { public: // Width and precison are not specified, no flags are set. 
- bool is_basic() const { return flags_.basic; } - bool has_left_flag() const { return flags_.left; } - bool has_show_pos_flag() const { return flags_.show_pos; } - bool has_sign_col_flag() const { return flags_.sign_col; } - bool has_alt_flag() const { return flags_.alt; } - bool has_zero_flag() const { return flags_.zero; } + bool is_basic() const { return flags_ == Flags::kBasic; } + bool has_left_flag() const { return FlagsContains(flags_, Flags::kLeft); } + bool has_show_pos_flag() const { + return FlagsContains(flags_, Flags::kShowPos); + } + bool has_sign_col_flag() const { + return FlagsContains(flags_, Flags::kSignCol); + } + bool has_alt_flag() const { return FlagsContains(flags_, Flags::kAlt); } + bool has_zero_flag() const { return FlagsContains(flags_, Flags::kZero); } FormatConversionChar conversion_char() const { // Keep this field first in the struct . It generates better code when @@ -306,7 +324,7 @@ struct FormatConversionSpecImplFriend final { conv->precision_ = p; } static std::string FlagsToString(const FormatConversionSpecImpl& spec) { - return spec.flags_.ToString(); + return str_format_internal::FlagsToString(spec.flags_); } }; diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc index f308d02351..2c9c07dacc 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc @@ -34,60 +34,67 @@ namespace str_format_internal { using CC = FormatConversionCharInternal; using LM = LengthMod; +// Abbreviations to fit in the table below. 
+constexpr auto f_sign = Flags::kSignCol; +constexpr auto f_alt = Flags::kAlt; +constexpr auto f_pos = Flags::kShowPos; +constexpr auto f_left = Flags::kLeft; +constexpr auto f_zero = Flags::kZero; + ABSL_CONST_INIT const ConvTag kTags[256] = { - {}, {}, {}, {}, {}, {}, {}, {}, // 00-07 - {}, {}, {}, {}, {}, {}, {}, {}, // 08-0f - {}, {}, {}, {}, {}, {}, {}, {}, // 10-17 - {}, {}, {}, {}, {}, {}, {}, {}, // 18-1f - {}, {}, {}, {}, {}, {}, {}, {}, // 20-27 - {}, {}, {}, {}, {}, {}, {}, {}, // 28-2f - {}, {}, {}, {}, {}, {}, {}, {}, // 30-37 - {}, {}, {}, {}, {}, {}, {}, {}, // 38-3f - {}, CC::A, {}, {}, {}, CC::E, CC::F, CC::G, // @ABCDEFG - {}, {}, {}, {}, LM::L, {}, {}, {}, // HIJKLMNO - {}, {}, {}, {}, {}, {}, {}, {}, // PQRSTUVW - CC::X, {}, {}, {}, {}, {}, {}, {}, // XYZ[\]^_ - {}, CC::a, {}, CC::c, CC::d, CC::e, CC::f, CC::g, // `abcdefg - LM::h, CC::i, LM::j, {}, LM::l, {}, CC::n, CC::o, // hijklmno - CC::p, LM::q, {}, CC::s, LM::t, CC::u, {}, {}, // pqrstuvw - CC::x, {}, LM::z, {}, {}, {}, {}, {}, // xyz{|}! 
- {}, {}, {}, {}, {}, {}, {}, {}, // 80-87 - {}, {}, {}, {}, {}, {}, {}, {}, // 88-8f - {}, {}, {}, {}, {}, {}, {}, {}, // 90-97 - {}, {}, {}, {}, {}, {}, {}, {}, // 98-9f - {}, {}, {}, {}, {}, {}, {}, {}, // a0-a7 - {}, {}, {}, {}, {}, {}, {}, {}, // a8-af - {}, {}, {}, {}, {}, {}, {}, {}, // b0-b7 - {}, {}, {}, {}, {}, {}, {}, {}, // b8-bf - {}, {}, {}, {}, {}, {}, {}, {}, // c0-c7 - {}, {}, {}, {}, {}, {}, {}, {}, // c8-cf - {}, {}, {}, {}, {}, {}, {}, {}, // d0-d7 - {}, {}, {}, {}, {}, {}, {}, {}, // d8-df - {}, {}, {}, {}, {}, {}, {}, {}, // e0-e7 - {}, {}, {}, {}, {}, {}, {}, {}, // e8-ef - {}, {}, {}, {}, {}, {}, {}, {}, // f0-f7 - {}, {}, {}, {}, {}, {}, {}, {}, // f8-ff + {}, {}, {}, {}, {}, {}, {}, {}, // 00-07 + {}, {}, {}, {}, {}, {}, {}, {}, // 08-0f + {}, {}, {}, {}, {}, {}, {}, {}, // 10-17 + {}, {}, {}, {}, {}, {}, {}, {}, // 18-1f + f_sign, {}, {}, f_alt, {}, {}, {}, {}, // !"#$%&' + {}, {}, {}, f_pos, {}, f_left, {}, {}, // ()*+,-./ + f_zero, {}, {}, {}, {}, {}, {}, {}, // 01234567 + {}, {}, {}, {}, {}, {}, {}, {}, // 89:;<=>? + {}, CC::A, {}, {}, {}, CC::E, CC::F, CC::G, // @ABCDEFG + {}, {}, {}, {}, LM::L, {}, {}, {}, // HIJKLMNO + {}, {}, {}, {}, {}, {}, {}, {}, // PQRSTUVW + CC::X, {}, {}, {}, {}, {}, {}, {}, // XYZ[\]^_ + {}, CC::a, {}, CC::c, CC::d, CC::e, CC::f, CC::g, // `abcdefg + LM::h, CC::i, LM::j, {}, LM::l, {}, CC::n, CC::o, // hijklmno + CC::p, LM::q, {}, CC::s, LM::t, CC::u, {}, {}, // pqrstuvw + CC::x, {}, LM::z, {}, {}, {}, {}, {}, // xyz{|}! 
+ {}, {}, {}, {}, {}, {}, {}, {}, // 80-87 + {}, {}, {}, {}, {}, {}, {}, {}, // 88-8f + {}, {}, {}, {}, {}, {}, {}, {}, // 90-97 + {}, {}, {}, {}, {}, {}, {}, {}, // 98-9f + {}, {}, {}, {}, {}, {}, {}, {}, // a0-a7 + {}, {}, {}, {}, {}, {}, {}, {}, // a8-af + {}, {}, {}, {}, {}, {}, {}, {}, // b0-b7 + {}, {}, {}, {}, {}, {}, {}, {}, // b8-bf + {}, {}, {}, {}, {}, {}, {}, {}, // c0-c7 + {}, {}, {}, {}, {}, {}, {}, {}, // c8-cf + {}, {}, {}, {}, {}, {}, {}, {}, // d0-d7 + {}, {}, {}, {}, {}, {}, {}, {}, // d8-df + {}, {}, {}, {}, {}, {}, {}, {}, // e0-e7 + {}, {}, {}, {}, {}, {}, {}, {}, // e8-ef + {}, {}, {}, {}, {}, {}, {}, {}, // f0-f7 + {}, {}, {}, {}, {}, {}, {}, {}, // f8-ff }; namespace { bool CheckFastPathSetting(const UnboundConversion& conv) { - bool should_be_basic = !conv.flags.left && // - !conv.flags.show_pos && // - !conv.flags.sign_col && // - !conv.flags.alt && // - !conv.flags.zero && // - (conv.width.value() == -1) && - (conv.precision.value() == -1); - if (should_be_basic != conv.flags.basic) { + bool width_precision_needed = + conv.width.value() >= 0 || conv.precision.value() >= 0; + if (width_precision_needed && conv.flags == Flags::kBasic) { fprintf(stderr, "basic=%d left=%d show_pos=%d sign_col=%d alt=%d zero=%d " "width=%d precision=%d\n", - conv.flags.basic, conv.flags.left, conv.flags.show_pos, - conv.flags.sign_col, conv.flags.alt, conv.flags.zero, - conv.width.value(), conv.precision.value()); + conv.flags == Flags::kBasic ? 1 : 0, + FlagsContains(conv.flags, Flags::kLeft) ? 1 : 0, + FlagsContains(conv.flags, Flags::kShowPos) ? 1 : 0, + FlagsContains(conv.flags, Flags::kSignCol) ? 1 : 0, + FlagsContains(conv.flags, Flags::kAlt) ? 1 : 0, + FlagsContains(conv.flags, Flags::kZero) ? 
1 : 0, conv.width.value(), + conv.precision.value()); + return false; } - return should_be_basic == conv.flags.basic; + return true; } template <bool is_positional> @@ -131,40 +138,21 @@ const char *ConsumeConversion(const char *pos, const char *const end, ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); // We should start with the basic flag on. - assert(conv->flags.basic); + assert(conv->flags == Flags::kBasic); // Any non alpha character makes this conversion not basic. // This includes flags (-+ #0), width (1-9, *) or precision (.). // All conversion characters and length modifiers are alpha characters. if (c < 'A') { - conv->flags.basic = false; - - for (; c <= '0';) { - // FIXME: We might be able to speed this up reusing the lookup table from - // above. It might require changing Flags to be a plain integer where we - // can |= a value. - switch (c) { - case '-': - conv->flags.left = true; - break; - case '+': - conv->flags.show_pos = true; - break; - case ' ': - conv->flags.sign_col = true; - break; - case '#': - conv->flags.alt = true; - break; - case '0': - conv->flags.zero = true; - break; - default: - goto flags_done; + while (c <= '0') { + auto tag = GetTagForChar(c); + if (tag.is_flags()) { + conv->flags = conv->flags | tag.as_flags(); + ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); + } else { + break; } - ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); } -flags_done: if (c <= '9') { if (c >= '0') { @@ -173,12 +161,12 @@ flags_done: if (ABSL_PREDICT_FALSE(*next_arg != 0)) return nullptr; // Positional conversion. 
*next_arg = -1; - conv->flags = Flags(); - conv->flags.basic = true; return ConsumeConversion<true>(original_pos, end, conv, next_arg); } + conv->flags = conv->flags | Flags::kNonBasic; conv->width.set_value(maybe_width); } else if (c == '*') { + conv->flags = conv->flags | Flags::kNonBasic; ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); if (is_positional) { if (ABSL_PREDICT_FALSE(c < '1' || c > '9')) return nullptr; @@ -192,6 +180,7 @@ flags_done: } if (c == '.') { + conv->flags = conv->flags | Flags::kNonBasic; ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); if (std::isdigit(c)) { conv->precision.set_value(parse_digits()); diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h index 6504dd3ddc..ad8646edff 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h @@ -41,10 +41,7 @@ std::string LengthModToString(LengthMod v); // The analyzed properties of a single specified conversion. struct UnboundConversion { - UnboundConversion() - : flags() /* This is required to zero all the fields of flags. */ { - flags.basic = true; - } + UnboundConversion() {} class InputValue { public: @@ -79,7 +76,7 @@ struct UnboundConversion { InputValue width; InputValue precision; - Flags flags; + Flags flags = Flags::kBasic; LengthMod length_mod = LengthMod::none; FormatConversionChar conv = FormatConversionCharInternal::kNone; }; @@ -93,32 +90,43 @@ const char* ConsumeUnboundConversion(const char* p, const char* end, UnboundConversion* conv, int* next_arg); // Helper tag class for the table below. -// It allows fast `char -> ConversionChar/LengthMod` checking and +// It allows fast `char -> ConversionChar/LengthMod/Flags` checking and // conversions. 
class ConvTag { public: constexpr ConvTag(FormatConversionChar conversion_char) // NOLINT - : tag_(static_cast<int8_t>(conversion_char)) {} - // We invert the length modifiers to make them negative so that we can easily - // test for them. + : tag_(static_cast<uint8_t>(conversion_char)) {} constexpr ConvTag(LengthMod length_mod) // NOLINT - : tag_(~static_cast<std::int8_t>(length_mod)) {} - // Everything else is -128, which is negative to make is_conv() simpler. - constexpr ConvTag() : tag_(-128) {} + : tag_(0x80 | static_cast<uint8_t>(length_mod)) {} + constexpr ConvTag(Flags flags) // NOLINT + : tag_(0xc0 | static_cast<uint8_t>(flags)) {} + constexpr ConvTag() : tag_(0xFF) {} + + bool is_conv() const { return (tag_ & 0x80) == 0; } + bool is_length() const { return (tag_ & 0xC0) == 0x80; } + bool is_flags() const { return (tag_ & 0xE0) == 0xC0; } - bool is_conv() const { return tag_ >= 0; } - bool is_length() const { return tag_ < 0 && tag_ != -128; } FormatConversionChar as_conv() const { assert(is_conv()); + assert(!is_length()); + assert(!is_flags()); return static_cast<FormatConversionChar>(tag_); } LengthMod as_length() const { + assert(!is_conv()); assert(is_length()); - return static_cast<LengthMod>(~tag_); + assert(!is_flags()); + return static_cast<LengthMod>(tag_ & 0x3F); + } + Flags as_flags() const { + assert(!is_conv()); + assert(!is_length()); + assert(is_flags()); + return static_cast<Flags>(tag_ & 0x1F); } private: - std::int8_t tag_; + uint8_t tag_; }; extern const ConvTag kTags[256]; diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc index a5fa1c79aa..fe0d296360 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc @@ -270,15 +270,22 @@ TEST_F(ConsumeUnboundConversionTest, Flags) { for (int k = 0; k < kNumFlags; ++k) if ((i >> k) & 1) 
fmt += kAllFlags[k]; // flag order shouldn't matter - if (rev == 1) { std::reverse(fmt.begin(), fmt.end()); } + if (rev == 1) { + std::reverse(fmt.begin(), fmt.end()); + } fmt += 'd'; SCOPED_TRACE(fmt); EXPECT_TRUE(Run(fmt.c_str())); - EXPECT_EQ(fmt.find('-') == std::string::npos, !o.flags.left); - EXPECT_EQ(fmt.find('+') == std::string::npos, !o.flags.show_pos); - EXPECT_EQ(fmt.find(' ') == std::string::npos, !o.flags.sign_col); - EXPECT_EQ(fmt.find('#') == std::string::npos, !o.flags.alt); - EXPECT_EQ(fmt.find('0') == std::string::npos, !o.flags.zero); + EXPECT_EQ(fmt.find('-') == std::string::npos, + !FlagsContains(o.flags, Flags::kLeft)); + EXPECT_EQ(fmt.find('+') == std::string::npos, + !FlagsContains(o.flags, Flags::kShowPos)); + EXPECT_EQ(fmt.find(' ') == std::string::npos, + !FlagsContains(o.flags, Flags::kSignCol)); + EXPECT_EQ(fmt.find('#') == std::string::npos, + !FlagsContains(o.flags, Flags::kAlt)); + EXPECT_EQ(fmt.find('0') == std::string::npos, + !FlagsContains(o.flags, Flags::kZero)); } } } @@ -288,14 +295,14 @@ TEST_F(ConsumeUnboundConversionTest, BasicFlag) { for (const char* fmt : {"d", "llx", "G", "1$X"}) { SCOPED_TRACE(fmt); EXPECT_TRUE(Run(fmt)); - EXPECT_TRUE(o.flags.basic); + EXPECT_EQ(o.flags, Flags::kBasic); } // Flag is off for (const char* fmt : {"3d", ".llx", "-G", "1$#X"}) { SCOPED_TRACE(fmt); EXPECT_TRUE(Run(fmt)); - EXPECT_FALSE(o.flags.basic); + EXPECT_NE(o.flags, Flags::kBasic); } } diff --git a/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h b/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h index a2f41c1531..17c1bfe8d3 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h @@ -32,7 +32,7 @@ #include <array> #include <initializer_list> #include <iterator> -#include <map> +#include <tuple> #include <type_traits> #include <utility> #include <vector> @@ -182,6 +182,13 @@ template <typename T> struct 
HasConstIterator<T, absl::void_t<typename T::const_iterator>> : std::true_type {}; +// HasEmplace<T>::value is true iff there exists a method T::emplace(). +template <typename T, typename = void> +struct HasEmplace : std::false_type {}; +template <typename T> +struct HasEmplace<T, absl::void_t<decltype(std::declval<T>().emplace())>> + : std::true_type {}; + // IsInitializerList<T>::value is true iff T is an std::initializer_list. More // details below in Splitter<> where this is used. std::false_type IsInitializerListDispatch(...); // default: No @@ -372,50 +379,43 @@ class Splitter { // value. template <typename Container, typename First, typename Second> struct ConvertToContainer<Container, std::pair<const First, Second>, true> { + using iterator = typename Container::iterator; + Container operator()(const Splitter& splitter) const { Container m; - typename Container::iterator it; + iterator it; bool insert = true; - for (const auto& sp : splitter) { + for (const absl::string_view sv : splitter) { if (insert) { - it = Inserter<Container>::Insert(&m, First(sp), Second()); + it = InsertOrEmplace(&m, sv); } else { - it->second = Second(sp); + it->second = Second(sv); } insert = !insert; } return m; } - // Inserts the key and value into the given map, returning an iterator to - // the inserted item. Specialized for std::map and std::multimap to use - // emplace() and adapt emplace()'s return value. - template <typename Map> - struct Inserter { - using M = Map; - template <typename... Args> - static typename M::iterator Insert(M* m, Args&&... args) { - return m->insert(std::make_pair(std::forward<Args>(args)...)).first; - } - }; - - template <typename... Ts> - struct Inserter<std::map<Ts...>> { - using M = std::map<Ts...>; - template <typename... Args> - static typename M::iterator Insert(M* m, Args&&... args) { - return m->emplace(std::make_pair(std::forward<Args>(args)...)).first; - } - }; - - template <typename... 
Ts> - struct Inserter<std::multimap<Ts...>> { - using M = std::multimap<Ts...>; - template <typename... Args> - static typename M::iterator Insert(M* m, Args&&... args) { - return m->emplace(std::make_pair(std::forward<Args>(args)...)); - } - }; + // Inserts the key and an empty value into the map, returning an iterator to + // the inserted item. We use emplace() if available, otherwise insert(). + template <typename M> + static absl::enable_if_t<HasEmplace<M>::value, iterator> InsertOrEmplace( + M* m, absl::string_view key) { + // Use piecewise_construct to support old versions of gcc in which pair + // constructor can't otherwise construct string from string_view. + return ToIter(m->emplace(std::piecewise_construct, std::make_tuple(key), + std::tuple<>())); + } + template <typename M> + static absl::enable_if_t<!HasEmplace<M>::value, iterator> InsertOrEmplace( + M* m, absl::string_view key) { + return ToIter(m->insert(std::make_pair(First(key), Second("")))); + } + + static iterator ToIter(std::pair<iterator, bool> pair) { + return pair.first; + } + static iterator ToIter(iterator iter) { return iter; } }; StringType text_; diff --git a/third_party/abseil-cpp/absl/strings/numbers.h b/third_party/abseil-cpp/absl/strings/numbers.h index ffc738fa41..1780bb44bd 100644 --- a/third_party/abseil-cpp/absl/strings/numbers.h +++ b/third_party/abseil-cpp/absl/strings/numbers.h @@ -124,6 +124,7 @@ inline void PutTwoDigits(size_t i, char* buf) { } // safe_strto?() functions for implementing SimpleAtoi() + bool safe_strto32_base(absl::string_view text, int32_t* value, int base); bool safe_strto64_base(absl::string_view text, int64_t* value, int base); bool safe_strto128_base(absl::string_view text, absl::int128* value, diff --git a/third_party/abseil-cpp/absl/strings/str_cat.cc b/third_party/abseil-cpp/absl/strings/str_cat.cc index dd5d25b0d6..f4a77493a4 100644 --- a/third_party/abseil-cpp/absl/strings/str_cat.cc +++ b/third_party/abseil-cpp/absl/strings/str_cat.cc @@ -174,7 
+174,7 @@ void AppendPieces(std::string* dest, ASSERT_NO_OVERLAP(*dest, piece); total_size += piece.size(); } - strings_internal::STLStringResizeUninitialized(dest, total_size); + strings_internal::STLStringResizeUninitializedAmortized(dest, total_size); char* const begin = &(*dest)[0]; char* out = begin + old_size; @@ -199,7 +199,7 @@ void StrAppend(std::string* dest, const AlphaNum& a, const AlphaNum& b) { ASSERT_NO_OVERLAP(*dest, a); ASSERT_NO_OVERLAP(*dest, b); std::string::size_type old_size = dest->size(); - strings_internal::STLStringResizeUninitialized( + strings_internal::STLStringResizeUninitializedAmortized( dest, old_size + a.size() + b.size()); char* const begin = &(*dest)[0]; char* out = begin + old_size; @@ -214,7 +214,7 @@ void StrAppend(std::string* dest, const AlphaNum& a, const AlphaNum& b, ASSERT_NO_OVERLAP(*dest, b); ASSERT_NO_OVERLAP(*dest, c); std::string::size_type old_size = dest->size(); - strings_internal::STLStringResizeUninitialized( + strings_internal::STLStringResizeUninitializedAmortized( dest, old_size + a.size() + b.size() + c.size()); char* const begin = &(*dest)[0]; char* out = begin + old_size; @@ -231,7 +231,7 @@ void StrAppend(std::string* dest, const AlphaNum& a, const AlphaNum& b, ASSERT_NO_OVERLAP(*dest, c); ASSERT_NO_OVERLAP(*dest, d); std::string::size_type old_size = dest->size(); - strings_internal::STLStringResizeUninitialized( + strings_internal::STLStringResizeUninitializedAmortized( dest, old_size + a.size() + b.size() + c.size() + d.size()); char* const begin = &(*dest)[0]; char* out = begin + old_size; diff --git a/third_party/abseil-cpp/absl/strings/str_split_test.cc b/third_party/abseil-cpp/absl/strings/str_split_test.cc index 7f7c097fae..f472f9eda1 100644 --- a/third_party/abseil-cpp/absl/strings/str_split_test.cc +++ b/third_party/abseil-cpp/absl/strings/str_split_test.cc @@ -29,6 +29,8 @@ #include "gtest/gtest.h" #include "absl/base/dynamic_annotations.h" #include "absl/base/macros.h" +#include 
"absl/container/btree_map.h" +#include "absl/container/btree_set.h" #include "absl/container/flat_hash_map.h" #include "absl/container/node_hash_map.h" #include "absl/strings/numbers.h" @@ -405,6 +407,10 @@ TEST(Splitter, ConversionOperator) { TestConversionOperator<std::set<std::string>>(splitter); TestConversionOperator<std::multiset<absl::string_view>>(splitter); TestConversionOperator<std::multiset<std::string>>(splitter); + TestConversionOperator<absl::btree_set<absl::string_view>>(splitter); + TestConversionOperator<absl::btree_set<std::string>>(splitter); + TestConversionOperator<absl::btree_multiset<absl::string_view>>(splitter); + TestConversionOperator<absl::btree_multiset<std::string>>(splitter); TestConversionOperator<std::unordered_set<std::string>>(splitter); // Tests conversion to map-like objects. @@ -421,6 +427,22 @@ TEST(Splitter, ConversionOperator) { TestMapConversionOperator<std::multimap<std::string, absl::string_view>>( splitter); TestMapConversionOperator<std::multimap<std::string, std::string>>(splitter); + TestMapConversionOperator< + absl::btree_map<absl::string_view, absl::string_view>>(splitter); + TestMapConversionOperator<absl::btree_map<absl::string_view, std::string>>( + splitter); + TestMapConversionOperator<absl::btree_map<std::string, absl::string_view>>( + splitter); + TestMapConversionOperator<absl::btree_map<std::string, std::string>>( + splitter); + TestMapConversionOperator< + absl::btree_multimap<absl::string_view, absl::string_view>>(splitter); + TestMapConversionOperator< + absl::btree_multimap<absl::string_view, std::string>>(splitter); + TestMapConversionOperator< + absl::btree_multimap<std::string, absl::string_view>>(splitter); + TestMapConversionOperator<absl::btree_multimap<std::string, std::string>>( + splitter); TestMapConversionOperator<std::unordered_map<std::string, std::string>>( splitter); TestMapConversionOperator< diff --git a/third_party/abseil-cpp/absl/strings/string_view.cc 
b/third_party/abseil-cpp/absl/strings/string_view.cc index c5f5de936d..d596e08cde 100644 --- a/third_party/abseil-cpp/absl/strings/string_view.cc +++ b/third_party/abseil-cpp/absl/strings/string_view.cc @@ -78,8 +78,8 @@ std::ostream& operator<<(std::ostream& o, string_view piece) { return o; } -string_view::size_type string_view::find(string_view s, size_type pos) const - noexcept { +string_view::size_type string_view::find(string_view s, + size_type pos) const noexcept { if (empty() || pos > length_) { if (empty() && pos == 0 && s.empty()) return 0; return npos; @@ -98,8 +98,8 @@ string_view::size_type string_view::find(char c, size_type pos) const noexcept { return result != nullptr ? result - ptr_ : npos; } -string_view::size_type string_view::rfind(string_view s, size_type pos) const - noexcept { +string_view::size_type string_view::rfind(string_view s, + size_type pos) const noexcept { if (length_ < s.length_) return npos; if (s.empty()) return std::min(length_, pos); const char* last = ptr_ + std::min(length_ - s.length_, pos) + s.length_; @@ -108,8 +108,8 @@ string_view::size_type string_view::rfind(string_view s, size_type pos) const } // Search range is [0..pos] inclusive. If pos == npos, search everything. -string_view::size_type string_view::rfind(char c, size_type pos) const - noexcept { +string_view::size_type string_view::rfind(char c, + size_type pos) const noexcept { // Note: memrchr() is not available on Windows. 
if (empty()) return npos; for (size_type i = std::min(pos, length_ - 1);; --i) { @@ -121,9 +121,8 @@ string_view::size_type string_view::rfind(char c, size_type pos) const return npos; } -string_view::size_type string_view::find_first_of(string_view s, - size_type pos) const - noexcept { +string_view::size_type string_view::find_first_of( + string_view s, size_type pos) const noexcept { if (empty() || s.empty()) { return npos; } @@ -138,9 +137,8 @@ string_view::size_type string_view::find_first_of(string_view s, return npos; } -string_view::size_type string_view::find_first_not_of(string_view s, - size_type pos) const - noexcept { +string_view::size_type string_view::find_first_not_of( + string_view s, size_type pos) const noexcept { if (empty()) return npos; // Avoid the cost of LookupTable() for a single-character search. if (s.length_ == 1) return find_first_not_of(s.ptr_[0], pos); @@ -153,9 +151,8 @@ string_view::size_type string_view::find_first_not_of(string_view s, return npos; } -string_view::size_type string_view::find_first_not_of(char c, - size_type pos) const - noexcept { +string_view::size_type string_view::find_first_not_of( + char c, size_type pos) const noexcept { if (empty()) return npos; for (; pos < length_; ++pos) { if (ptr_[pos] != c) { @@ -180,9 +177,8 @@ string_view::size_type string_view::find_last_of(string_view s, return npos; } -string_view::size_type string_view::find_last_not_of(string_view s, - size_type pos) const - noexcept { +string_view::size_type string_view::find_last_not_of( + string_view s, size_type pos) const noexcept { if (empty()) return npos; size_type i = std::min(pos, length_ - 1); if (s.empty()) return i; @@ -198,9 +194,8 @@ string_view::size_type string_view::find_last_not_of(string_view s, return npos; } -string_view::size_type string_view::find_last_not_of(char c, - size_type pos) const - noexcept { +string_view::size_type string_view::find_last_not_of( + char c, size_type pos) const noexcept { if (empty()) return 
npos; size_type i = std::min(pos, length_ - 1); for (;; --i) { diff --git a/third_party/abseil-cpp/absl/strings/string_view.h b/third_party/abseil-cpp/absl/strings/string_view.h index 5260b5b73f..968549be46 100644 --- a/third_party/abseil-cpp/absl/strings/string_view.h +++ b/third_party/abseil-cpp/absl/strings/string_view.h @@ -36,6 +36,7 @@ #include <limits> #include <string> +#include "absl/base/attributes.h" #include "absl/base/config.h" #include "absl/base/internal/throw_delegate.h" #include "absl/base/macros.h" @@ -61,6 +62,12 @@ ABSL_NAMESPACE_END #define ABSL_INTERNAL_STRING_VIEW_MEMCMP memcmp #endif // ABSL_HAVE_BUILTIN(__builtin_memcmp) +#if defined(__cplusplus) && __cplusplus >= 201402L +#define ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR constexpr +#else +#define ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR +#endif + namespace absl { ABSL_NAMESPACE_BEGIN @@ -180,8 +187,8 @@ class string_view { template <typename Allocator> string_view( // NOLINT(runtime/explicit) - const std::basic_string<char, std::char_traits<char>, Allocator>& - str) noexcept + const std::basic_string<char, std::char_traits<char>, Allocator>& str + ABSL_ATTRIBUTE_LIFETIME_BOUND) noexcept // This is implemented in terms of `string_view(p, n)` so `str.size()` // doesn't need to be reevaluated after `ptr_` is set. : string_view(str.data(), str.size()) {} @@ -264,9 +271,7 @@ class string_view { // string_view::size() // // Returns the number of characters in the `string_view`. - constexpr size_type size() const noexcept { - return length_; - } + constexpr size_type size() const noexcept { return length_; } // string_view::length() // @@ -333,7 +338,7 @@ class string_view { // // Removes the first `n` characters from the `string_view`. Note that the // underlying string is not changed, only the view. 
- void remove_prefix(size_type n) { + ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR void remove_prefix(size_type n) { ABSL_HARDENING_ASSERT(n <= length_); ptr_ += n; length_ -= n; @@ -343,7 +348,7 @@ class string_view { // // Removes the last `n` characters from the `string_view`. Note that the // underlying string is not changed, only the view. - void remove_suffix(size_type n) { + ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR void remove_suffix(size_type n) { ABSL_HARDENING_ASSERT(n <= length_); length_ -= n; } @@ -351,7 +356,7 @@ class string_view { // string_view::swap() // // Swaps this `string_view` with another `string_view`. - void swap(string_view& s) noexcept { + ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR void swap(string_view& s) noexcept { auto t = *this; *this = s; s = t; @@ -388,7 +393,7 @@ class string_view { // `n`) as another string_view. This function throws `std::out_of_bounds` if // `pos > size`. // Use absl::ClippedSubstr if you need a truncating substr operation. - constexpr string_view substr(size_type pos, size_type n = npos) const { + constexpr string_view substr(size_type pos = 0, size_type n = npos) const { return ABSL_PREDICT_FALSE(pos > length_) ? (base_internal::ThrowStdOutOfRange( "absl::string_view::substr"), @@ -398,12 +403,10 @@ class string_view { // string_view::compare() // - // Performs a lexicographical comparison between the `string_view` and - // another `absl::string_view`, returning -1 if `this` is less than, 0 if - // `this` is equal to, and 1 if `this` is greater than the passed string - // view. Note that in the case of data equality, a further comparison is made - // on the respective sizes of the two `string_view`s to determine which is - // smaller, equal, or greater. + // Performs a lexicographical comparison between this `string_view` and + // another `string_view` `x`, returning a negative value if `*this` is less + // than `x`, 0 if `*this` is equal to `x`, and a positive value if `*this` + // is greater than `x`. 
constexpr int compare(string_view x) const noexcept { return CompareImpl(length_, x.length_, Min(length_, x.length_) == 0 @@ -414,31 +417,31 @@ class string_view { // Overload of `string_view::compare()` for comparing a substring of the // 'string_view` and another `absl::string_view`. - int compare(size_type pos1, size_type count1, string_view v) const { + constexpr int compare(size_type pos1, size_type count1, string_view v) const { return substr(pos1, count1).compare(v); } // Overload of `string_view::compare()` for comparing a substring of the // `string_view` and a substring of another `absl::string_view`. - int compare(size_type pos1, size_type count1, string_view v, size_type pos2, - size_type count2) const { + constexpr int compare(size_type pos1, size_type count1, string_view v, + size_type pos2, size_type count2) const { return substr(pos1, count1).compare(v.substr(pos2, count2)); } // Overload of `string_view::compare()` for comparing a `string_view` and a - // a different C-style string `s`. - int compare(const char* s) const { return compare(string_view(s)); } + // a different C-style string `s`. + constexpr int compare(const char* s) const { return compare(string_view(s)); } // Overload of `string_view::compare()` for comparing a substring of the // `string_view` and a different string C-style string `s`. - int compare(size_type pos1, size_type count1, const char* s) const { + constexpr int compare(size_type pos1, size_type count1, const char* s) const { return substr(pos1, count1).compare(string_view(s)); } // Overload of `string_view::compare()` for comparing a substring of the // `string_view` and a substring of a different C-style string `s`. 
- int compare(size_type pos1, size_type count1, const char* s, - size_type count2) const { + constexpr int compare(size_type pos1, size_type count1, const char* s, + size_type count2) const { return substr(pos1, count1).compare(string_view(s, count2)); } @@ -455,48 +458,92 @@ class string_view { // within the `string_view`. size_type find(char c, size_type pos = 0) const noexcept; + // Overload of `string_view::find()` for finding a substring of a different + // C-style string `s` within the `string_view`. + size_type find(const char* s, size_type pos, size_type count) const { + return find(string_view(s, count), pos); + } + + // Overload of `string_view::find()` for finding a different C-style string + // `s` within the `string_view`. + size_type find(const char* s, size_type pos = 0) const { + return find(string_view(s), pos); + } + // string_view::rfind() // // Finds the last occurrence of a substring `s` within the `string_view`, // returning the position of the first character's match, or `npos` if no // match was found. - size_type rfind(string_view s, size_type pos = npos) const - noexcept; + size_type rfind(string_view s, size_type pos = npos) const noexcept; // Overload of `string_view::rfind()` for finding the last given character `c` // within the `string_view`. size_type rfind(char c, size_type pos = npos) const noexcept; + // Overload of `string_view::rfind()` for finding a substring of a different + // C-style string `s` within the `string_view`. + size_type rfind(const char* s, size_type pos, size_type count) const { + return rfind(string_view(s, count), pos); + } + + // Overload of `string_view::rfind()` for finding a different C-style string + // `s` within the `string_view`. 
+ size_type rfind(const char* s, size_type pos = npos) const { + return rfind(string_view(s), pos); + } + // string_view::find_first_of() // // Finds the first occurrence of any of the characters in `s` within the // `string_view`, returning the start position of the match, or `npos` if no // match was found. - size_type find_first_of(string_view s, size_type pos = 0) const - noexcept; + size_type find_first_of(string_view s, size_type pos = 0) const noexcept; // Overload of `string_view::find_first_of()` for finding a character `c` // within the `string_view`. - size_type find_first_of(char c, size_type pos = 0) const - noexcept { + size_type find_first_of(char c, size_type pos = 0) const noexcept { return find(c, pos); } + // Overload of `string_view::find_first_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_first_of(const char* s, size_type pos, + size_type count) const { + return find_first_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_first_of()` for finding a different C-style + // string `s` within the `string_view`. + size_type find_first_of(const char* s, size_type pos = 0) const { + return find_first_of(string_view(s), pos); + } + // string_view::find_last_of() // // Finds the last occurrence of any of the characters in `s` within the // `string_view`, returning the start position of the match, or `npos` if no // match was found. - size_type find_last_of(string_view s, size_type pos = npos) const - noexcept; + size_type find_last_of(string_view s, size_type pos = npos) const noexcept; // Overload of `string_view::find_last_of()` for finding a character `c` // within the `string_view`. 
- size_type find_last_of(char c, size_type pos = npos) const - noexcept { + size_type find_last_of(char c, size_type pos = npos) const noexcept { return rfind(c, pos); } + // Overload of `string_view::find_last_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_last_of(const char* s, size_type pos, size_type count) const { + return find_last_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_last_of()` for finding a different C-style + // string `s` within the `string_view`. + size_type find_last_of(const char* s, size_type pos = npos) const { + return find_last_of(string_view(s), pos); + } + // string_view::find_first_not_of() // // Finds the first occurrence of any of the characters not in `s` within the @@ -508,18 +555,43 @@ class string_view { // that is not `c` within the `string_view`. size_type find_first_not_of(char c, size_type pos = 0) const noexcept; + // Overload of `string_view::find_first_not_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_first_not_of(const char* s, size_type pos, + size_type count) const { + return find_first_not_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_first_not_of()` for finding a different + // C-style string `s` within the `string_view`. + size_type find_first_not_of(const char* s, size_type pos = 0) const { + return find_first_not_of(string_view(s), pos); + } + // string_view::find_last_not_of() // // Finds the last occurrence of any of the characters not in `s` within the // `string_view`, returning the start position of the last non-match, or // `npos` if no non-match was found. size_type find_last_not_of(string_view s, - size_type pos = npos) const noexcept; + size_type pos = npos) const noexcept; // Overload of `string_view::find_last_not_of()` for finding a character // that is not `c` within the `string_view`. 
- size_type find_last_not_of(char c, size_type pos = npos) const - noexcept; + size_type find_last_not_of(char c, size_type pos = npos) const noexcept; + + // Overload of `string_view::find_last_not_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_last_not_of(const char* s, size_type pos, + size_type count) const { + return find_last_not_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_last_not_of()` for finding a different + // C-style string `s` within the `string_view`. + size_type find_last_not_of(const char* s, size_type pos = npos) const { + return find_last_not_of(string_view(s), pos); + } private: static constexpr size_type kMaxSize = @@ -597,6 +669,7 @@ std::ostream& operator<<(std::ostream& o, string_view piece); ABSL_NAMESPACE_END } // namespace absl +#undef ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR #undef ABSL_INTERNAL_STRING_VIEW_MEMCMP #endif // ABSL_USES_STD_STRING_VIEW diff --git a/third_party/abseil-cpp/absl/strings/string_view_test.cc b/third_party/abseil-cpp/absl/strings/string_view_test.cc index 643af8f81b..2c13dd1c14 100644 --- a/third_party/abseil-cpp/absl/strings/string_view_test.cc +++ b/third_party/abseil-cpp/absl/strings/string_view_test.cc @@ -449,6 +449,24 @@ TEST(StringViewTest, STL2) { EXPECT_EQ(d.find('x', 4), absl::string_view::npos); EXPECT_EQ(e.find('x', 7), absl::string_view::npos); + EXPECT_EQ(a.find(b.data(), 1, 0), 1); + EXPECT_EQ(a.find(c.data(), 9, 0), 9); + EXPECT_EQ(a.find(c.data(), absl::string_view::npos, 0), + absl::string_view::npos); + EXPECT_EQ(b.find(c.data(), absl::string_view::npos, 0), + absl::string_view::npos); + // empty string nonsense + EXPECT_EQ(d.find(b.data(), 4, 0), absl::string_view::npos); + EXPECT_EQ(e.find(b.data(), 7, 0), absl::string_view::npos); + + EXPECT_EQ(a.find(b.data(), 1), absl::string_view::npos); + EXPECT_EQ(a.find(c.data(), 9), 23); + EXPECT_EQ(a.find(c.data(), absl::string_view::npos), 
absl::string_view::npos); + EXPECT_EQ(b.find(c.data(), absl::string_view::npos), absl::string_view::npos); + // empty string nonsense + EXPECT_EQ(d.find(b.data(), 4), absl::string_view::npos); + EXPECT_EQ(e.find(b.data(), 7), absl::string_view::npos); + EXPECT_EQ(a.rfind(b), 0); EXPECT_EQ(a.rfind(b, 1), 0); EXPECT_EQ(a.rfind(c), 23); @@ -490,6 +508,14 @@ TEST(StringViewTest, STL2) { EXPECT_EQ(e.rfind('o'), absl::string_view::npos); EXPECT_EQ(d.rfind('o', 4), absl::string_view::npos); EXPECT_EQ(e.rfind('o', 7), absl::string_view::npos); + + EXPECT_EQ(a.rfind(b.data(), 1, 0), 1); + EXPECT_EQ(a.rfind(c.data(), 22, 0), 22); + EXPECT_EQ(a.rfind(c.data(), 1, 0), 1); + EXPECT_EQ(a.rfind(c.data(), 0, 0), 0); + EXPECT_EQ(b.rfind(c.data(), 0, 0), 0); + EXPECT_EQ(d.rfind(b.data(), 4, 0), 0); + EXPECT_EQ(e.rfind(b.data(), 7, 0), 0); } // Continued from STL2 @@ -678,6 +704,7 @@ TEST(StringViewTest, STL2Substr) { EXPECT_EQ(a.substr(23, 3), c); EXPECT_EQ(a.substr(23, 99), c); EXPECT_EQ(a.substr(0), a); + EXPECT_EQ(a.substr(), a); EXPECT_EQ(a.substr(3, 2), "de"); // empty string nonsense EXPECT_EQ(d.substr(0, 99), e); @@ -1087,7 +1114,24 @@ TEST(StringViewTest, ConstexprCompiles) { EXPECT_EQ(sp_npos, -1); } -TEST(StringViewTest, ConstexprSubstr) { +constexpr char ConstexprMethodsHelper() { +#if defined(__cplusplus) && __cplusplus >= 201402L + absl::string_view str("123", 3); + str.remove_prefix(1); + str.remove_suffix(1); + absl::string_view bar; + str.swap(bar); + return bar.front(); +#else + return '2'; +#endif +} + +TEST(StringViewTest, ConstexprMethods) { + // remove_prefix, remove_suffix, swap + static_assert(ConstexprMethodsHelper() == '2', ""); + + // substr constexpr absl::string_view foobar("foobar", 6); constexpr absl::string_view foo = foobar.substr(0, 3); constexpr absl::string_view bar = foobar.substr(3); diff --git a/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt b/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt index e633d0bf53..605efe2d02 
100644 --- a/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt @@ -95,7 +95,7 @@ absl_cc_test( DEPS absl::synchronization absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -108,7 +108,7 @@ absl_cc_test( DEPS absl::synchronization absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -122,7 +122,7 @@ absl_cc_test( absl::graphcycles_internal absl::core_headers absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -154,7 +154,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -167,7 +167,7 @@ absl_cc_test( DEPS absl::synchronization absl::time - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -183,7 +183,7 @@ absl_cc_library( absl::config absl::strings absl::time - gmock + GTest::gmock TESTONLY ) @@ -199,7 +199,7 @@ absl_cc_test( absl::synchronization absl::strings absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc b/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc index 3cea7aed24..d2f82da3bb 100644 --- a/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc +++ b/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc @@ -14,41 +14,51 @@ #include "absl/synchronization/blocking_counter.h" +#include <atomic> + #include "absl/base/internal/raw_logging.h" namespace absl { ABSL_NAMESPACE_BEGIN -// Return whether int *arg is zero. -static bool IsZero(void *arg) { - return 0 == *reinterpret_cast<int *>(arg); +namespace { + +// Return whether int *arg is true. +bool IsDone(void *arg) { return *reinterpret_cast<bool *>(arg); } + +} // namespace + +BlockingCounter::BlockingCounter(int initial_count) + : count_(initial_count), + num_waiting_(0), + done_{initial_count == 0 ? 
true : false} { + ABSL_RAW_CHECK(initial_count >= 0, "BlockingCounter initial_count negative"); } bool BlockingCounter::DecrementCount() { - MutexLock l(&lock_); - count_--; - if (count_ < 0) { - ABSL_RAW_LOG( - FATAL, - "BlockingCounter::DecrementCount() called too many times. count=%d", - count_); + int count = count_.fetch_sub(1, std::memory_order_acq_rel) - 1; + ABSL_RAW_CHECK(count >= 0, + "BlockingCounter::DecrementCount() called too many times"); + if (count == 0) { + MutexLock l(&lock_); + done_ = true; + return true; } - return count_ == 0; + return false; } void BlockingCounter::Wait() { MutexLock l(&this->lock_); - ABSL_RAW_CHECK(count_ >= 0, "BlockingCounter underflow"); // only one thread may call Wait(). To support more than one thread, // implement a counter num_to_exit, like in the Barrier class. ABSL_RAW_CHECK(num_waiting_ == 0, "multiple threads called Wait()"); num_waiting_++; - this->lock_.Await(Condition(IsZero, &this->count_)); + this->lock_.Await(Condition(IsDone, &this->done_)); - // At this point, We know that all threads executing DecrementCount have - // released the lock, and so will not touch this object again. + // At this point, we know that all threads executing DecrementCount + // will not touch this object again. // Therefore, the thread calling this method is free to delete the object // after we return from this method. 
} diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter.h b/third_party/abseil-cpp/absl/synchronization/blocking_counter.h index 1f53f9f240..1908fdb1d9 100644 --- a/third_party/abseil-cpp/absl/synchronization/blocking_counter.h +++ b/third_party/abseil-cpp/absl/synchronization/blocking_counter.h @@ -20,6 +20,8 @@ #ifndef ABSL_SYNCHRONIZATION_BLOCKING_COUNTER_H_ #define ABSL_SYNCHRONIZATION_BLOCKING_COUNTER_H_ +#include <atomic> + #include "absl/base/thread_annotations.h" #include "absl/synchronization/mutex.h" @@ -60,8 +62,7 @@ ABSL_NAMESPACE_BEGIN // class BlockingCounter { public: - explicit BlockingCounter(int initial_count) - : count_(initial_count), num_waiting_(0) {} + explicit BlockingCounter(int initial_count); BlockingCounter(const BlockingCounter&) = delete; BlockingCounter& operator=(const BlockingCounter&) = delete; @@ -89,8 +90,9 @@ class BlockingCounter { private: Mutex lock_; - int count_ ABSL_GUARDED_BY(lock_); + std::atomic<int> count_; int num_waiting_ ABSL_GUARDED_BY(lock_); + bool done_ ABSL_GUARDED_BY(lock_); }; ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter_benchmark.cc b/third_party/abseil-cpp/absl/synchronization/blocking_counter_benchmark.cc new file mode 100644 index 0000000000..b504d1a57c --- /dev/null +++ b/third_party/abseil-cpp/absl/synchronization/blocking_counter_benchmark.cc @@ -0,0 +1,83 @@ +// Copyright 2021 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <limits> + +#include "absl/synchronization/blocking_counter.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "benchmark/benchmark.h" + +namespace { + +void BM_BlockingCounter_SingleThread(benchmark::State& state) { + for (auto _ : state) { + int iterations = state.range(0); + absl::BlockingCounter counter{iterations}; + for (int i = 0; i < iterations; ++i) { + counter.DecrementCount(); + } + counter.Wait(); + } +} +BENCHMARK(BM_BlockingCounter_SingleThread) + ->ArgName("iterations") + ->Arg(2) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256); + +void BM_BlockingCounter_DecrementCount(benchmark::State& state) { + static absl::BlockingCounter* counter = + new absl::BlockingCounter{std::numeric_limits<int>::max()}; + for (auto _ : state) { + counter->DecrementCount(); + } +} +BENCHMARK(BM_BlockingCounter_DecrementCount) + ->Threads(2) + ->Threads(4) + ->Threads(6) + ->Threads(8) + ->Threads(10) + ->Threads(12) + ->Threads(16) + ->Threads(32) + ->Threads(64) + ->Threads(128); + +void BM_BlockingCounter_Wait(benchmark::State& state) { + int num_threads = state.range(0); + absl::synchronization_internal::ThreadPool pool(num_threads); + for (auto _ : state) { + absl::BlockingCounter counter{num_threads}; + pool.Schedule([num_threads, &counter, &pool]() { + for (int i = 0; i < num_threads; ++i) { + pool.Schedule([&counter]() { counter.DecrementCount(); }); + } + }); + counter.Wait(); + } +} +BENCHMARK(BM_BlockingCounter_Wait) + ->ArgName("threads") + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16) + ->Arg(32) + ->Arg(64) + ->Arg(128); + +} // namespace diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc b/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc index 2926224af7..06885f5759 100644 --- a/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc +++ 
b/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc @@ -63,6 +63,18 @@ TEST(BlockingCounterTest, BasicFunctionality) { } } +TEST(BlockingCounterTest, WaitZeroInitialCount) { + BlockingCounter counter(0); + counter.Wait(); +} + +#if GTEST_HAS_DEATH_TEST +TEST(BlockingCounterTest, WaitNegativeInitialCount) { + EXPECT_DEATH(BlockingCounter counter(-1), + "BlockingCounter initial_count negative"); +} +#endif + } // namespace ABSL_NAMESPACE_END } // namespace absl diff --git a/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc b/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc index 8cf59e64e9..db1184e679 100644 --- a/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc +++ b/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc @@ -159,7 +159,7 @@ TEST_F(PerThreadSemTest, Timeouts) { const absl::Duration elapsed = absl::Now() - start; // Allow for a slight early return, to account for quality of implementation // issues on various platforms. - const absl::Duration slop = absl::Microseconds(200); + const absl::Duration slop = absl::Milliseconds(1); EXPECT_LE(delay - slop, elapsed) << "Wait returned " << delay - elapsed << " early (with " << slop << " slop), start time was " << start; diff --git a/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc b/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc index 2123be60f5..28ef311e4a 100644 --- a/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc +++ b/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc @@ -79,6 +79,7 @@ bool Waiter::Wait(KernelTimeout t) { // Note that, since the thread ticker is just reset, we don't need to check // whether the thread is idle on the very first pass of the loop. 
bool first_pass = true; + while (true) { int32_t x = futex_.load(std::memory_order_relaxed); while (x != 0) { @@ -90,7 +91,6 @@ bool Waiter::Wait(KernelTimeout t) { return true; // Consumed a wakeup, we are done. } - if (!first_pass) MaybeBecomeIdle(); const int err = Futex::WaitUntil(&futex_, 0, t); if (err != 0) { diff --git a/third_party/abseil-cpp/absl/synchronization/mutex_test.cc b/third_party/abseil-cpp/absl/synchronization/mutex_test.cc index 058f757b48..f8fbf9488c 100644 --- a/third_party/abseil-cpp/absl/synchronization/mutex_test.cc +++ b/third_party/abseil-cpp/absl/synchronization/mutex_test.cc @@ -852,7 +852,7 @@ TEST(Mutex, MutexReaderDecrementBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { // held and then destroyed (w/o unlocking). #ifdef ABSL_HAVE_THREAD_SANITIZER // TSAN reports errors when locked Mutexes are destroyed. -TEST(Mutex, DISABLED_LockedMutexDestructionBug) NO_THREAD_SAFETY_ANALYSIS { +TEST(Mutex, DISABLED_LockedMutexDestructionBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #else TEST(Mutex, LockedMutexDestructionBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #endif @@ -1153,7 +1153,7 @@ TEST(Mutex, DeadlockDetectorStressTest) ABSL_NO_THREAD_SAFETY_ANALYSIS { #ifdef ABSL_HAVE_THREAD_SANITIZER // TSAN reports errors when locked Mutexes are destroyed. 
-TEST(Mutex, DISABLED_DeadlockIdBug) NO_THREAD_SAFETY_ANALYSIS { +TEST(Mutex, DISABLED_DeadlockIdBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #else TEST(Mutex, DeadlockIdBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #endif diff --git a/third_party/abseil-cpp/absl/time/CMakeLists.txt b/third_party/abseil-cpp/absl/time/CMakeLists.txt index 00bdd499c1..f6ff8bd127 100644 --- a/third_party/abseil-cpp/absl/time/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/time/CMakeLists.txt @@ -102,7 +102,7 @@ absl_cc_library( absl::config absl::raw_logging_internal absl::time_zone - gmock + GTest::gmock TESTONLY ) @@ -124,5 +124,5 @@ absl_cc_test( absl::config absl::core_headers absl::time_zone - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/time/civil_time.cc b/third_party/abseil-cpp/absl/time/civil_time.cc index bdfe9ce0ef..6a231edb2d 100644 --- a/third_party/abseil-cpp/absl/time/civil_time.cc +++ b/third_party/abseil-cpp/absl/time/civil_time.cc @@ -38,9 +38,7 @@ std::string FormatYearAnd(string_view fmt, CivilSecond cs) { const CivilSecond ncs(NormalizeYear(cs.year()), cs.month(), cs.day(), cs.hour(), cs.minute(), cs.second()); const TimeZone utc = UTCTimeZone(); - // TODO(absl-team): Avoid conversion of fmt string. - return StrCat(cs.year(), - FormatTime(std::string(fmt), FromCivil(ncs, utc), utc)); + return StrCat(cs.year(), FormatTime(fmt, FromCivil(ncs, utc), utc)); } template <typename CivilT> diff --git a/third_party/abseil-cpp/absl/time/duration_test.cc b/third_party/abseil-cpp/absl/time/duration_test.cc index fb28fa987f..b7209e1c0a 100644 --- a/third_party/abseil-cpp/absl/time/duration_test.cc +++ b/third_party/abseil-cpp/absl/time/duration_test.cc @@ -17,6 +17,7 @@ #endif #include <chrono> // NOLINT(build/c++11) +#include <cfloat> #include <cmath> #include <cstdint> #include <ctime> @@ -1320,7 +1321,7 @@ TEST(Duration, SmallConversions) { EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(0)); // TODO(bww): Is the next one OK? 
- EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(0.124999999e-9)); + EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(std::nextafter(0.125e-9, 0))); EXPECT_EQ(absl::Nanoseconds(1) / 4, absl::Seconds(0.125e-9)); EXPECT_EQ(absl::Nanoseconds(1) / 4, absl::Seconds(0.250e-9)); EXPECT_EQ(absl::Nanoseconds(1) / 2, absl::Seconds(0.375e-9)); @@ -1330,7 +1331,7 @@ TEST(Duration, SmallConversions) { EXPECT_EQ(absl::Nanoseconds(1), absl::Seconds(0.875e-9)); EXPECT_EQ(absl::Nanoseconds(1), absl::Seconds(1.000e-9)); - EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(-0.124999999e-9)); + EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(std::nextafter(-0.125e-9, 0))); EXPECT_EQ(-absl::Nanoseconds(1) / 4, absl::Seconds(-0.125e-9)); EXPECT_EQ(-absl::Nanoseconds(1) / 4, absl::Seconds(-0.250e-9)); EXPECT_EQ(-absl::Nanoseconds(1) / 2, absl::Seconds(-0.375e-9)); @@ -1390,6 +1391,14 @@ void VerifyApproxSameAsMul(double time_as_seconds, int* const misses) { // Seconds(point) returns a duration near point * Seconds(1.0). (They may // not be exactly equal due to fused multiply/add contraction.) TEST(Duration, ToDoubleSecondsCheckEdgeCases) { +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations can be + // performed with 80-bit floats. This means the edge cases are different than + // what we expect here, so just skip this test. 
+ GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + constexpr uint32_t kTicksPerSecond = absl::time_internal::kTicksPerSecond; constexpr auto duration_tick = absl::time_internal::MakeDuration(0, 1u); int misses = 0; diff --git a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc index 303c0244a8..f2b3294ef7 100644 --- a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc +++ b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc @@ -53,7 +53,7 @@ int Parse02d(const char* p) { } // namespace bool FixedOffsetFromName(const std::string& name, seconds* offset) { - if (name.compare(0, std::string::npos, "UTC", 3) == 0) { + if (name == "UTC" || name == "UTC0") { *offset = seconds::zero(); return true; } diff --git a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc index a11f93e2a5..294f2e2284 100644 --- a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc +++ b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc @@ -1135,7 +1135,7 @@ TEST(Parse, ExtendedSeconds) { // All %E<prec>S cases are treated the same as %E*S on input. auto precisions = {"*", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"}; - for (const std::string& prec : precisions) { + for (const std::string prec : precisions) { const std::string fmt = "%E" + prec + "S"; SCOPED_TRACE(fmt); time_point<chrono::nanoseconds> tp = unix_epoch; @@ -1217,7 +1217,7 @@ TEST(Parse, ExtendedSubeconds) { // All %E<prec>f cases are treated the same as %E*f on input. 
auto precisions = {"*", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"}; - for (const std::string& prec : precisions) { + for (const std::string prec : precisions) { const std::string fmt = "%E" + prec + "f"; SCOPED_TRACE(fmt); time_point<chrono::nanoseconds> tp = unix_epoch - chrono::seconds(1); diff --git a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc index 9a1a8d6e40..6948c3ea2c 100644 --- a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc +++ b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc @@ -717,6 +717,18 @@ TEST(TimeZones, LoadZonesConcurrently) { } #endif +TEST(TimeZone, UTC) { + const time_zone utc = utc_time_zone(); + + time_zone loaded_utc; + EXPECT_TRUE(load_time_zone("UTC", &loaded_utc)); + EXPECT_EQ(loaded_utc, utc); + + time_zone loaded_utc0; + EXPECT_TRUE(load_time_zone("UTC0", &loaded_utc0)); + EXPECT_EQ(loaded_utc0, utc); +} + TEST(TimeZone, NamedTimeZones) { const time_zone utc = utc_time_zone(); EXPECT_EQ("UTC", utc.name()); diff --git a/third_party/abseil-cpp/absl/time/time.h b/third_party/abseil-cpp/absl/time/time.h index d9ad1aedd8..48982df45a 100644 --- a/third_party/abseil-cpp/absl/time/time.h +++ b/third_party/abseil-cpp/absl/time/time.h @@ -1180,11 +1180,15 @@ inline Time FromDateTime(int64_t year, int mon, int day, int hour, // // Converts the `tm_year`, `tm_mon`, `tm_mday`, `tm_hour`, `tm_min`, and // `tm_sec` fields to an `absl::Time` using the given time zone. See ctime(3) -// for a description of the expected values of the tm fields. If the indicated -// time instant is not unique (see `absl::TimeZone::At(absl::CivilSecond)` -// above), the `tm_isdst` field is consulted to select the desired instant -// (`tm_isdst` > 0 means DST, `tm_isdst` == 0 means no DST, `tm_isdst` < 0 -// means use the post-transition offset). 
+// for a description of the expected values of the tm fields. If the civil time +// is unique (see `absl::TimeZone::At(absl::CivilSecond)` above), the matching +// time instant is returned. Otherwise, the `tm_isdst` field is consulted to +// choose between the possible results. For a repeated civil time, `tm_isdst != +// 0` returns the matching DST instant, while `tm_isdst == 0` returns the +// matching non-DST instant. For a skipped civil time there is no matching +// instant, so `tm_isdst != 0` returns the DST instant, and `tm_isdst == 0` +// returns the non-DST instant, that would have matched if the transition never +// happened. Time FromTM(const struct tm& tm, TimeZone tz); // ToTM() @@ -1348,7 +1352,7 @@ constexpr Duration MakeDuration(int64_t hi, int64_t lo) { inline Duration MakePosDoubleDuration(double n) { const int64_t int_secs = static_cast<int64_t>(n); const uint32_t ticks = static_cast<uint32_t>( - (n - static_cast<double>(int_secs)) * kTicksPerSecond + 0.5); + std::round((n - static_cast<double>(int_secs)) * kTicksPerSecond)); return ticks < kTicksPerSecond ? 
MakeDuration(int_secs, ticks) : MakeDuration(int_secs + 1, ticks - kTicksPerSecond); diff --git a/third_party/abseil-cpp/absl/types/CMakeLists.txt b/third_party/abseil-cpp/absl/types/CMakeLists.txt index c356b2117d..d7e8614e0d 100644 --- a/third_party/abseil-cpp/absl/types/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/types/CMakeLists.txt @@ -69,7 +69,7 @@ absl_cc_test( absl::exception_testing absl::raw_logging_internal absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -85,7 +85,7 @@ absl_cc_test( absl::exception_testing absl::raw_logging_internal absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -99,7 +99,7 @@ absl_cc_test( absl::any absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -136,7 +136,7 @@ absl_cc_test( absl::inlined_vector absl::hash_testing absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -156,7 +156,7 @@ absl_cc_test( absl::inlined_vector absl::hash_testing absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -222,7 +222,7 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -236,7 +236,7 @@ absl_cc_test( absl::optional absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -258,7 +258,7 @@ absl_cc_library( absl::type_traits absl::strings absl::utility - gmock_main + GTest::gmock_main TESTONLY ) @@ -275,7 +275,7 @@ absl_cc_test( DEPS absl::conformance_testing absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -288,7 +288,7 @@ absl_cc_test( DEPS absl::conformance_testing absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -324,7 +324,7 @@ absl_cc_test( absl::memory absl::type_traits absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -350,7 +350,7 @@ absl_cc_test( DEPS absl::base absl::compare - gmock_main + GTest::gmock_main ) 
absl_cc_test( @@ -365,5 +365,5 @@ absl_cc_test( absl::config absl::exception_safety_testing absl::memory - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/types/span.h b/third_party/abseil-cpp/absl/types/span.h index 95fe79262d..41db3420db 100644 --- a/third_party/abseil-cpp/absl/types/span.h +++ b/third_party/abseil-cpp/absl/types/span.h @@ -243,8 +243,8 @@ class Span { // template <typename LazyT = T, typename = EnableIfConstView<LazyT>> - Span( - std::initializer_list<value_type> v) noexcept // NOLINT(runtime/explicit) + Span(std::initializer_list<value_type> v + ABSL_ATTRIBUTE_LIFETIME_BOUND) noexcept // NOLINT(runtime/explicit) : Span(v.begin(), v.size()) {} // Accessors diff --git a/third_party/abseil-cpp/absl/utility/CMakeLists.txt b/third_party/abseil-cpp/absl/utility/CMakeLists.txt index e1edd19aa0..865b758f23 100644 --- a/third_party/abseil-cpp/absl/utility/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/utility/CMakeLists.txt @@ -40,5 +40,5 @@ absl_cc_test( absl::core_headers absl::memory absl::strings - gmock_main + GTest::gmock_main ) diff --git a/third_party/crc32c/CMakeLists.txt b/third_party/crc32c/CMakeLists.txt new file mode 100644 index 0000000000..bc720892d7 --- /dev/null +++ b/third_party/crc32c/CMakeLists.txt @@ -0,0 +1,66 @@ +android_add_library( + TARGET + crc32c + LICENSE + "BSD-3-Clause" + SRC + src/src/crc32c.cc + src/src/crc32c_portable.cc) +target_include_directories(crc32c PUBLIC config src/include) +target_compile_definitions(crc32c PRIVATE BYTE_ORDER_BIG_ENDIAN=0 + CRC32C_TESTS_BUILT_WITH_GLOG=0) +if(LINUX_AARCH64 OR DARWIN_AARCH64) + target_compile_definitions(crc32c PRIVATE HAVE_MM_PREFETCH=0 HAVE_SSE42=0) + target_sources(crc32c PRIVATE src/src/crc32c_arm64.cc) + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_definitions(crc32c PRIVATE HAVE_ARM64_CRC32C=0) + target_compile_options( + crc32c + PRIVATE "-march=armv8-a" + # Some builds set -march to a different value from the above. 
The + # specific feature flags below enable the instructions we need in + # these cases. See https://crbug.com/934016 for example. + "-Xclang -target-feature" + "-Xclang +crc" + "-Xclang -target-feature" + "-Xclang +crypto") + else() + target_compile_options(crc32c PRIVATE "-march=armv8-a+crc+crypto") + target_compile_definitions(crc32c PRIVATE HAVE_ARM64_CRC32C=1) + endif() +else() + target_compile_definitions(crc32c PRIVATE HAVE_MM_PREFETCH=1 HAVE_SSE42=1) + target_compile_definitions(crc32c PRIVATE HAVE_ARM64_CRC32C=0) + target_sources(crc32c PRIVATE src/src/crc32c_sse42.cc) + if(WINDOWS_MSVC_X86_64) + target_compile_options(crc32c PRIVATE -mavx) + else() + target_compile_options(crc32c PRIVATE -msse4.2) + endif() +endif() + +target_compile_definitions(crc32c PRIVATE HAVE_BUILTIN_PREFETCH=1) + +if(LINUX_AARCH64 OR LINUX_X86_64) + target_compile_definitions(crc32c PRIVATE HAVE_STRONG_GETAUXVAL=1 + HAVE_WEAK_GETAUXVAL=1) + +else() + target_compile_definitions(crc32c PRIVATE HAVE_STRONG_GETAUXVAL=0 + HAVE_WEAK_GETAUXVAL=0) +endif() + +android_add_test( + TARGET + crc32c_tests + SRC + "src/src/crc32c_arm64_unittest.cc" + "src/src/crc32c_extend_unittests.h" + "src/src/crc32c_portable_unittest.cc" + "src/src/crc32c_prefetch_unittest.cc" + "src/src/crc32c_read_le_unittest.cc" + "src/src/crc32c_round_up_unittest.cc" + "src/src/crc32c_sse42_unittest.cc" + "src/src/crc32c_unittest.cc") + +target_link_libraries(crc32c_tests PRIVATE crc32c gmock_main) diff --git a/third_party/crc32c/config/crc32c/crc32c_config.h b/third_party/crc32c/config/crc32c/crc32c_config.h new file mode 100644 index 0000000000..3589fa678c --- /dev/null +++ b/third_party/crc32c/config/crc32c/crc32c_config.h @@ -0,0 +1,6 @@ +// Copyright 2017 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This is a stub. The preprocessor macros that are usually defined here are +// supplied by BUILD.gn instead. 
diff --git a/third_party/crc32c/src/include/crc32c/crc32c.h b/third_party/crc32c/src/include/crc32c/crc32c.h new file mode 100644 index 0000000000..e8a78170a9 --- /dev/null +++ b/third_party/crc32c/src/include/crc32c/crc32c.h @@ -0,0 +1,89 @@ +/* Copyright 2017 The CRC32C Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. */ + +#ifndef CRC32C_CRC32C_H_ +#define CRC32C_CRC32C_H_ + +/* The API exported by the CRC32C project. */ + +#if defined(__cplusplus) + +#include <cstddef> +#include <cstdint> +#include <string> + +#else /* !defined(__cplusplus) */ + +#include <stddef.h> +#include <stdint.h> + +#endif /* !defined(__cplusplus) */ + + +/* The C API. */ + +#if defined(__cplusplus) +extern "C" { +#endif /* defined(__cplusplus) */ + +/* Extends "crc" with the CRC32C of "count" bytes in the buffer pointed by + "data" */ +uint32_t crc32c_extend(uint32_t crc, const uint8_t* data, size_t count); + +/* Computes the CRC32C of "count" bytes in the buffer pointed by "data". */ +uint32_t crc32c_value(const uint8_t* data, size_t count); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif /* defined(__cplusplus) */ + + +/* The C++ API. */ + +#if defined(__cplusplus) + +namespace crc32c { + +// Extends "crc" with the CRC32C of "count" bytes in the buffer pointed by +// "data". +uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count); + +// Computes the CRC32C of "count" bytes in the buffer pointed by "data". +inline uint32_t Crc32c(const uint8_t* data, size_t count) { + return Extend(0, data, count); +} + +// Computes the CRC32C of "count" bytes in the buffer pointed by "data". +inline uint32_t Crc32c(const char* data, size_t count) { + return Extend(0, reinterpret_cast<const uint8_t*>(data), count); +} + +// Computes the CRC32C of the string's content. 
+inline uint32_t Crc32c(const std::string& string) { + return Crc32c(reinterpret_cast<const uint8_t*>(string.data()), + string.size()); +} + +} // namespace crc32c + +#if __cplusplus > 201402L +#if __has_include(<string_view>) +#include <string_view> + +namespace crc32c { + +// Computes the CRC32C of the bytes in the string_view. +inline uint32_t Crc32c(const std::string_view& string_view) { + return Crc32c(reinterpret_cast<const uint8_t*>(string_view.data()), + string_view.size()); +} + +} // namespace crc32c + +#endif // __has_include(<string_view>) +#endif // __cplusplus > 201402L + +#endif /* defined(__cplusplus) */ + +#endif // CRC32C_CRC32C_H_ diff --git a/third_party/crc32c/src/src/crc32c.cc b/third_party/crc32c/src/src/crc32c.cc new file mode 100644 index 0000000000..4d3018af47 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c.cc @@ -0,0 +1,39 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "crc32c/crc32c.h" + +#include <cstddef> +#include <cstdint> + +#include "./crc32c_arm64.h" +#include "./crc32c_arm64_linux_check.h" +#include "./crc32c_internal.h" +#include "./crc32c_sse42.h" +#include "./crc32c_sse42_check.h" + +namespace crc32c { + +uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + static bool can_use_sse42 = CanUseSse42(); + if (can_use_sse42) return ExtendSse42(crc, data, count); +#elif HAVE_ARM64_CRC32C + static bool can_use_arm_linux = CanUseArm64Linux(); + if (can_use_arm_linux) return ExtendArm64(crc, data, count); +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + + return ExtendPortable(crc, data, count); +} + +extern "C" uint32_t crc32c_extend(uint32_t crc, const uint8_t* data, + size_t count) { + return crc32c::Extend(crc, data, count); +} + +extern "C" uint32_t crc32c_value(const uint8_t* data, size_t count) { + return crc32c::Crc32c(data, count); +} + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_arm64.cc b/third_party/crc32c/src/src/crc32c_arm64.cc new file mode 100644 index 0000000000..9a988c1eed --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64.cc @@ -0,0 +1,124 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_arm64.h" + +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable ARM NEON CRC32C +// instructions. + +// This implementation is based on https://github.com/google/leveldb/pull/490. 
+ +#include <cstddef> +#include <cstdint> + +#include "./crc32c_internal.h" +#include "crc32c/crc32c_config.h" + +#if HAVE_ARM64_CRC32C + +#include <arm_acle.h> +#include <arm_neon.h> + +#define KBYTES 1032 +#define SEGMENTBYTES 256 + +// compute 8bytes for each segment parallelly +#define CRC32C32BYTES(P, IND) \ + do { \ + crc1 = __crc32cd( \ + crc1, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 1 + (IND))); \ + crc2 = __crc32cd( \ + crc2, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 2 + (IND))); \ + crc3 = __crc32cd( \ + crc3, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 3 + (IND))); \ + crc0 = __crc32cd( \ + crc0, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 0 + (IND))); \ + } while (0); + +// compute 8*8 bytes for each segment parallelly +#define CRC32C256BYTES(P, IND) \ + do { \ + CRC32C32BYTES((P), (IND)*8 + 0) \ + CRC32C32BYTES((P), (IND)*8 + 1) \ + CRC32C32BYTES((P), (IND)*8 + 2) \ + CRC32C32BYTES((P), (IND)*8 + 3) \ + CRC32C32BYTES((P), (IND)*8 + 4) \ + CRC32C32BYTES((P), (IND)*8 + 5) \ + CRC32C32BYTES((P), (IND)*8 + 6) \ + CRC32C32BYTES((P), (IND)*8 + 7) \ + } while (0); + +// compute 4*8*8 bytes for each segment parallelly +#define CRC32C1024BYTES(P) \ + do { \ + CRC32C256BYTES((P), 0) \ + CRC32C256BYTES((P), 1) \ + CRC32C256BYTES((P), 2) \ + CRC32C256BYTES((P), 3) \ + (P) += 4 * SEGMENTBYTES; \ + } while (0) + +namespace crc32c { + +uint32_t ExtendArm64(uint32_t crc, const uint8_t *buf, size_t size) { + int64_t length = size; + uint32_t crc0, crc1, crc2, crc3; + uint64_t t0, t1, t2; + + // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)), + // k2=CRC(x^(SEGMENTBYTES*8)) + const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4; + + crc = crc ^ kCRC32Xor; + const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + + while (length >= KBYTES) { + crc0 = crc; + crc1 = 0; + crc2 = 0; + crc3 = 0; + + // Process 1024 bytes in parallel. + CRC32C1024BYTES(p); + + // Merge the 4 partial CRC32C values. 
+ t2 = (uint64_t)vmull_p64(crc2, k2); + t1 = (uint64_t)vmull_p64(crc1, k1); + t0 = (uint64_t)vmull_p64(crc0, k0); + crc = __crc32cd(crc3, *(uint64_t *)p); + p += sizeof(uint64_t); + crc ^= __crc32cd(0, t2); + crc ^= __crc32cd(0, t1); + crc ^= __crc32cd(0, t0); + + length -= KBYTES; + } + + while (length >= 8) { + crc = __crc32cd(crc, *(uint64_t *)p); + p += 8; + length -= 8; + } + + if (length & 4) { + crc = __crc32cw(crc, *(uint32_t *)p); + p += 4; + } + + if (length & 2) { + crc = __crc32ch(crc, *(uint16_t *)p); + p += 2; + } + + if (length & 1) { + crc = __crc32cb(crc, *p); + } + + return crc ^ kCRC32Xor; +} + +} // namespace crc32c + +#endif // HAVE_ARM64_CRC32C diff --git a/third_party/crc32c/src/src/crc32c_arm64.h b/third_party/crc32c/src/src/crc32c_arm64.h new file mode 100644 index 0000000000..bbdece46c7 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64.h @@ -0,0 +1,25 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Linux-specific code checking the availability for ARM CRC32C instructions. + +#ifndef CRC32C_CRC32C_ARM_LINUX_H_ +#define CRC32C_CRC32C_ARM_LINUX_H_ + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_ARM64_CRC32C + +namespace crc32c { + +uint32_t ExtendArm64(uint32_t crc, const uint8_t* data, size_t count); + +} // namespace crc32c + +#endif // HAVE_ARM64_CRC32C + +#endif // CRC32C_CRC32C_ARM_LINUX_H_ diff --git a/third_party/crc32c/src/src/crc32c_arm64_linux_check.h b/third_party/crc32c/src/src/crc32c_arm64_linux_check.h new file mode 100644 index 0000000000..6817979aac --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64_linux_check.h @@ -0,0 +1,48 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +// ARM Linux-specific code checking for the availability of CRC32C instructions. + +#ifndef CRC32C_CRC32C_ARM_LINUX_CHECK_H_ +#define CRC32C_CRC32C_ARM_LINUX_CHECK_H_ + +// X86-specific code checking for the availability of SSE4.2 instructions. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_ARM64_CRC32C + +#if HAVE_STRONG_GETAUXVAL +#include <sys/auxv.h> +#elif HAVE_WEAK_GETAUXVAL +// getauxval() is not available on Android until API level 20. Link it as a weak +// symbol. +extern "C" unsigned long getauxval(unsigned long type) __attribute__((weak)); + +#define AT_HWCAP 16 +#endif // HAVE_STRONG_GETAUXVAL || HAVE_WEAK_GETAUXVAL + +namespace crc32c { + +inline bool CanUseArm64Linux() { +#if HAVE_STRONG_GETAUXVAL || HAVE_WEAK_GETAUXVAL + // From 'arch/arm64/include/uapi/asm/hwcap.h' in Linux kernel source code. + constexpr unsigned long kHWCAP_PMULL = 1 << 4; + constexpr unsigned long kHWCAP_CRC32 = 1 << 7; + unsigned long hwcap = (&getauxval != nullptr) ? getauxval(AT_HWCAP) : 0; + return (hwcap & (kHWCAP_PMULL | kHWCAP_CRC32)) == + (kHWCAP_PMULL | kHWCAP_CRC32); +#else + return false; +#endif // HAVE_STRONG_GETAUXVAL || HAVE_WEAK_GETAUXVAL +} + +} // namespace crc32c + +#endif // HAVE_ARM64_CRC32C + +#endif // CRC32C_CRC32C_ARM_LINUX_CHECK_H_ diff --git a/third_party/crc32c/src/src/crc32c_arm64_unittest.cc b/third_party/crc32c/src/src/crc32c_arm64_unittest.cc new file mode 100644 index 0000000000..6f917d9c0c --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64_unittest.cc @@ -0,0 +1,24 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "gtest/gtest.h" + +#include "./crc32c_arm64.h" +#include "./crc32c_extend_unittests.h" + +namespace crc32c { + +#if HAVE_ARM64_CRC32C + +struct Arm64TestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ExtendArm64(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Arm64, ExtendTest, Arm64TestTraits); + +#endif // HAVE_ARM64_CRC32C + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_benchmark.cc b/third_party/crc32c/src/src/crc32c_benchmark.cc new file mode 100644 index 0000000000..d31af92256 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_benchmark.cc @@ -0,0 +1,104 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#include "benchmark/benchmark.h" + +#if CRC32C_TESTS_BUILT_WITH_GLOG +#include "glog/logging.h" +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + +#include "./crc32c_arm64.h" +#include "./crc32c_arm64_linux_check.h" +#include "./crc32c_internal.h" +#include "./crc32c_sse42.h" +#include "./crc32c_sse42_check.h" +#include "crc32c/crc32c.h" + +class CRC32CBenchmark : public benchmark::Fixture { + public: + void SetUp(const benchmark::State& state) override { + block_size_ = static_cast<size_t>(state.range(0)); + block_data_ = std::string(block_size_, 'x'); + block_buffer_ = reinterpret_cast<const uint8_t*>(block_data_.data()); + } + + protected: + std::string block_data_; + const uint8_t* block_buffer_; + size_t block_size_; +}; + +BENCHMARK_DEFINE_F(CRC32CBenchmark, Public)(benchmark::State& state) { + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::Extend(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, Public) + 
->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. + +BENCHMARK_DEFINE_F(CRC32CBenchmark, Portable)(benchmark::State& state) { + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::ExtendPortable(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, Portable) + ->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. + +#if HAVE_ARM64_CRC32C + +BENCHMARK_DEFINE_F(CRC32CBenchmark, ArmLinux)(benchmark::State& state) { + if (!crc32c::CanUseArm64Linux()) { + state.SkipWithError("ARM CRC32C instructions not available or not enabled"); + return; + } + + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::ExtendArm64(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, ArmLinux) + ->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. + +#endif // HAVE_ARM64_CRC32C + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +BENCHMARK_DEFINE_F(CRC32CBenchmark, Sse42)(benchmark::State& state) { + if (!crc32c::CanUseSse42()) { + state.SkipWithError("SSE4.2 instructions not available or not enabled"); + return; + } + + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::ExtendSse42(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, Sse42) + ->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. 
+ +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +int main(int argc, char** argv) { +#if CRC32C_TESTS_BUILT_WITH_GLOG + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/third_party/crc32c/src/src/crc32c_capi_unittest.c b/third_party/crc32c/src/src/crc32c_capi_unittest.c new file mode 100644 index 0000000000..c8993a0959 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_capi_unittest.c @@ -0,0 +1,66 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "crc32c/crc32c.h" + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int main() { + /* From rfc3720 section B.4. */ + uint8_t buf[32]; + + memset(buf, 0, sizeof(buf)); + if ((uint32_t)0x8a9136aa != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(zeros) test failed\n"); + return 1; + } + + memset(buf, 0xff, sizeof(buf)); + if ((uint32_t)0x62a8ab43 != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(0xff) test failed\n"); + return 1; + } + + for (size_t i = 0; i < 32; ++i) + buf[i] = (uint8_t)i; + if ((uint32_t)0x46dd794e != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(0..31) test failed\n"); + return 1; + } + + for (size_t i = 0; i < 32; ++i) + buf[i] = (uint8_t)(31 - i); + if ((uint32_t)0x113fdb5c != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(31..0) test failed\n"); + return 1; + } + + uint8_t data[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + if ((uint32_t)0xd9963a56 != crc32c_value(data, sizeof(data))) { + printf("crc32c_value(31..0) test failed\n"); + return 1; + } + + const uint8_t* hello_space_world = (const uint8_t*)"hello world"; + const uint8_t* hello_space = (const uint8_t*)"hello "; + const uint8_t* world = (const uint8_t*)"world"; + + if (crc32c_value(hello_space_world, 11) != + crc32c_extend(crc32c_value(hello_space, 6), world, 5)) { + printf("crc32c_extend test failed\n"); + return 1; + } + + printf("All tests passed\n"); + return 0; +} diff --git a/third_party/crc32c/src/src/crc32c_extend_unittests.h b/third_party/crc32c/src/src/crc32c_extend_unittests.h new file mode 100644 index 0000000000..0732973737 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_extend_unittests.h @@ -0,0 +1,112 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_EXTEND_UNITTESTS_H_ +#define CRC32C_CRC32C_EXTEND_UNITTESTS_H_ + +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "gtest/gtest.h" + +// Common test cases for all implementations of CRC32C_Extend(). + +namespace crc32c { + +template<typename TestTraits> +class ExtendTest : public testing::Test {}; + +TYPED_TEST_SUITE_P(ExtendTest); + +TYPED_TEST_P(ExtendTest, StandardResults) { + // From rfc3720 section B.4. 
+ uint8_t buf[32]; + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), + TypeParam::Extend(0, buf, sizeof(buf))); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), + TypeParam::Extend(0, buf, sizeof(buf))); + + for (int i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), + TypeParam::Extend(0, buf, sizeof(buf))); + + for (int i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), + TypeParam::Extend(0, buf, sizeof(buf))); + + uint8_t data[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + EXPECT_EQ(static_cast<uint32_t>(0xd9963a56), + TypeParam::Extend(0, data, sizeof(data))); +} + +TYPED_TEST_P(ExtendTest, HelloWorld) { + const uint8_t* hello_space_world = + reinterpret_cast<const uint8_t*>("hello world"); + const uint8_t* hello_space = reinterpret_cast<const uint8_t*>("hello "); + const uint8_t* world = reinterpret_cast<const uint8_t*>("world"); + + EXPECT_EQ(TypeParam::Extend(0, hello_space_world, 11), + TypeParam::Extend(TypeParam::Extend(0, hello_space, 6), world, 5)); +} + +TYPED_TEST_P(ExtendTest, BufferSlicing) { + uint8_t buffer[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + for (size_t i = 0; i < 48; ++i) { + for (size_t j = i + 1; j <= 48; ++j) { + uint32_t crc = 0; + + if (i > 0) crc = TypeParam::Extend(crc, buffer, i); + crc = TypeParam::Extend(crc, buffer + i, j - 
i); + if (j < 48) crc = TypeParam::Extend(crc, buffer + j, 48 - j); + + EXPECT_EQ(static_cast<uint32_t>(0xd9963a56), crc); + } + } +} + +TYPED_TEST_P(ExtendTest, LargeBufferSlicing) { + uint8_t buffer[2048]; + for (size_t i = 0; i < 2048; i++) + buffer[i] = static_cast<uint8_t>(3 * i * i + 7 * i + 11); + + for (size_t i = 0; i < 2048; ++i) { + for (size_t j = i + 1; j <= 2048; ++j) { + uint32_t crc = 0; + + if (i > 0) crc = TypeParam::Extend(crc, buffer, i); + crc = TypeParam::Extend(crc, buffer + i, j - i); + if (j < 2048) crc = TypeParam::Extend(crc, buffer + j, 2048 - j); + + EXPECT_EQ(static_cast<uint32_t>(0x36dcc753), crc); + } + } +} + +REGISTER_TYPED_TEST_SUITE_P(ExtendTest, + StandardResults, + HelloWorld, + BufferSlicing, + LargeBufferSlicing); + +} // namespace crc32c + +#endif // CRC32C_CRC32C_EXTEND_UNITTESTS_H_ diff --git a/third_party/crc32c/src/src/crc32c_internal.h b/third_party/crc32c/src/src/crc32c_internal.h new file mode 100644 index 0000000000..2bd23dea43 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_internal.h @@ -0,0 +1,23 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_INTERNAL_H_ +#define CRC32C_CRC32C_INTERNAL_H_ + +// Internal functions that may change between releases. + +#include <cstddef> +#include <cstdint> + +namespace crc32c { + +// Un-accelerated implementation that works on all CPUs. +uint32_t ExtendPortable(uint32_t crc, const uint8_t* data, size_t count); + +// CRCs are pre- and post- conditioned by xoring with all ones. 
+static constexpr const uint32_t kCRC32Xor = static_cast<uint32_t>(0xffffffffU); + +} // namespace crc32c + +#endif // CRC32C_CRC32C_INTERNAL_H_ diff --git a/third_party/crc32c/src/src/crc32c_portable.cc b/third_party/crc32c/src/src/crc32c_portable.cc new file mode 100644 index 0000000000..31ec6eac53 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_portable.cc @@ -0,0 +1,351 @@ +// Copyright 2008 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_internal.h" + +#include <cstddef> +#include <cstdint> + +#include "./crc32c_prefetch.h" +#include "./crc32c_read_le.h" +#include "./crc32c_round_up.h" + +namespace { + +const uint32_t kByteExtensionTable[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 
0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351}; 
+ +const uint32_t kStrideExtensionTable0[256] = { + 0x00000000, 0x30d23865, 0x61a470ca, 0x517648af, 0xc348e194, 0xf39ad9f1, + 0xa2ec915e, 0x923ea93b, 0x837db5d9, 0xb3af8dbc, 0xe2d9c513, 0xd20bfd76, + 0x4035544d, 0x70e76c28, 0x21912487, 0x11431ce2, 0x03171d43, 0x33c52526, + 0x62b36d89, 0x526155ec, 0xc05ffcd7, 0xf08dc4b2, 0xa1fb8c1d, 0x9129b478, + 0x806aa89a, 0xb0b890ff, 0xe1ced850, 0xd11ce035, 0x4322490e, 0x73f0716b, + 0x228639c4, 0x125401a1, 0x062e3a86, 0x36fc02e3, 0x678a4a4c, 0x57587229, + 0xc566db12, 0xf5b4e377, 0xa4c2abd8, 0x941093bd, 0x85538f5f, 0xb581b73a, + 0xe4f7ff95, 0xd425c7f0, 0x461b6ecb, 0x76c956ae, 0x27bf1e01, 0x176d2664, + 0x053927c5, 0x35eb1fa0, 0x649d570f, 0x544f6f6a, 0xc671c651, 0xf6a3fe34, + 0xa7d5b69b, 0x97078efe, 0x8644921c, 0xb696aa79, 0xe7e0e2d6, 0xd732dab3, + 0x450c7388, 0x75de4bed, 0x24a80342, 0x147a3b27, 0x0c5c750c, 0x3c8e4d69, + 0x6df805c6, 0x5d2a3da3, 0xcf149498, 0xffc6acfd, 0xaeb0e452, 0x9e62dc37, + 0x8f21c0d5, 0xbff3f8b0, 0xee85b01f, 0xde57887a, 0x4c692141, 0x7cbb1924, + 0x2dcd518b, 0x1d1f69ee, 0x0f4b684f, 0x3f99502a, 0x6eef1885, 0x5e3d20e0, + 0xcc0389db, 0xfcd1b1be, 0xada7f911, 0x9d75c174, 0x8c36dd96, 0xbce4e5f3, + 0xed92ad5c, 0xdd409539, 0x4f7e3c02, 0x7fac0467, 0x2eda4cc8, 0x1e0874ad, + 0x0a724f8a, 0x3aa077ef, 0x6bd63f40, 0x5b040725, 0xc93aae1e, 0xf9e8967b, + 0xa89eded4, 0x984ce6b1, 0x890ffa53, 0xb9ddc236, 0xe8ab8a99, 0xd879b2fc, + 0x4a471bc7, 0x7a9523a2, 0x2be36b0d, 0x1b315368, 0x096552c9, 0x39b76aac, + 0x68c12203, 0x58131a66, 0xca2db35d, 0xfaff8b38, 0xab89c397, 0x9b5bfbf2, + 0x8a18e710, 0xbacadf75, 0xebbc97da, 0xdb6eafbf, 0x49500684, 0x79823ee1, + 0x28f4764e, 0x18264e2b, 0x18b8ea18, 0x286ad27d, 0x791c9ad2, 0x49cea2b7, + 0xdbf00b8c, 0xeb2233e9, 0xba547b46, 0x8a864323, 0x9bc55fc1, 0xab1767a4, + 0xfa612f0b, 0xcab3176e, 0x588dbe55, 0x685f8630, 0x3929ce9f, 0x09fbf6fa, + 0x1baff75b, 0x2b7dcf3e, 0x7a0b8791, 0x4ad9bff4, 0xd8e716cf, 0xe8352eaa, + 0xb9436605, 0x89915e60, 0x98d24282, 0xa8007ae7, 0xf9763248, 0xc9a40a2d, + 0x5b9aa316, 0x6b489b73, 
0x3a3ed3dc, 0x0aecebb9, 0x1e96d09e, 0x2e44e8fb, + 0x7f32a054, 0x4fe09831, 0xddde310a, 0xed0c096f, 0xbc7a41c0, 0x8ca879a5, + 0x9deb6547, 0xad395d22, 0xfc4f158d, 0xcc9d2de8, 0x5ea384d3, 0x6e71bcb6, + 0x3f07f419, 0x0fd5cc7c, 0x1d81cddd, 0x2d53f5b8, 0x7c25bd17, 0x4cf78572, + 0xdec92c49, 0xee1b142c, 0xbf6d5c83, 0x8fbf64e6, 0x9efc7804, 0xae2e4061, + 0xff5808ce, 0xcf8a30ab, 0x5db49990, 0x6d66a1f5, 0x3c10e95a, 0x0cc2d13f, + 0x14e49f14, 0x2436a771, 0x7540efde, 0x4592d7bb, 0xd7ac7e80, 0xe77e46e5, + 0xb6080e4a, 0x86da362f, 0x97992acd, 0xa74b12a8, 0xf63d5a07, 0xc6ef6262, + 0x54d1cb59, 0x6403f33c, 0x3575bb93, 0x05a783f6, 0x17f38257, 0x2721ba32, + 0x7657f29d, 0x4685caf8, 0xd4bb63c3, 0xe4695ba6, 0xb51f1309, 0x85cd2b6c, + 0x948e378e, 0xa45c0feb, 0xf52a4744, 0xc5f87f21, 0x57c6d61a, 0x6714ee7f, + 0x3662a6d0, 0x06b09eb5, 0x12caa592, 0x22189df7, 0x736ed558, 0x43bced3d, + 0xd1824406, 0xe1507c63, 0xb02634cc, 0x80f40ca9, 0x91b7104b, 0xa165282e, + 0xf0136081, 0xc0c158e4, 0x52fff1df, 0x622dc9ba, 0x335b8115, 0x0389b970, + 0x11ddb8d1, 0x210f80b4, 0x7079c81b, 0x40abf07e, 0xd2955945, 0xe2476120, + 0xb331298f, 0x83e311ea, 0x92a00d08, 0xa272356d, 0xf3047dc2, 0xc3d645a7, + 0x51e8ec9c, 0x613ad4f9, 0x304c9c56, 0x009ea433}; + +const uint32_t kStrideExtensionTable1[256] = { + 0x00000000, 0x54075546, 0xa80eaa8c, 0xfc09ffca, 0x55f123e9, 0x01f676af, + 0xfdff8965, 0xa9f8dc23, 0xabe247d2, 0xffe51294, 0x03eced5e, 0x57ebb818, + 0xfe13643b, 0xaa14317d, 0x561dceb7, 0x021a9bf1, 0x5228f955, 0x062fac13, + 0xfa2653d9, 0xae21069f, 0x07d9dabc, 0x53de8ffa, 0xafd77030, 0xfbd02576, + 0xf9cabe87, 0xadcdebc1, 0x51c4140b, 0x05c3414d, 0xac3b9d6e, 0xf83cc828, + 0x043537e2, 0x503262a4, 0xa451f2aa, 0xf056a7ec, 0x0c5f5826, 0x58580d60, + 0xf1a0d143, 0xa5a78405, 0x59ae7bcf, 0x0da92e89, 0x0fb3b578, 0x5bb4e03e, + 0xa7bd1ff4, 0xf3ba4ab2, 0x5a429691, 0x0e45c3d7, 0xf24c3c1d, 0xa64b695b, + 0xf6790bff, 0xa27e5eb9, 0x5e77a173, 0x0a70f435, 0xa3882816, 0xf78f7d50, + 0x0b86829a, 0x5f81d7dc, 0x5d9b4c2d, 0x099c196b, 0xf595e6a1, 0xa192b3e7, 
+ 0x086a6fc4, 0x5c6d3a82, 0xa064c548, 0xf463900e, 0x4d4f93a5, 0x1948c6e3, + 0xe5413929, 0xb1466c6f, 0x18beb04c, 0x4cb9e50a, 0xb0b01ac0, 0xe4b74f86, + 0xe6add477, 0xb2aa8131, 0x4ea37efb, 0x1aa42bbd, 0xb35cf79e, 0xe75ba2d8, + 0x1b525d12, 0x4f550854, 0x1f676af0, 0x4b603fb6, 0xb769c07c, 0xe36e953a, + 0x4a964919, 0x1e911c5f, 0xe298e395, 0xb69fb6d3, 0xb4852d22, 0xe0827864, + 0x1c8b87ae, 0x488cd2e8, 0xe1740ecb, 0xb5735b8d, 0x497aa447, 0x1d7df101, + 0xe91e610f, 0xbd193449, 0x4110cb83, 0x15179ec5, 0xbcef42e6, 0xe8e817a0, + 0x14e1e86a, 0x40e6bd2c, 0x42fc26dd, 0x16fb739b, 0xeaf28c51, 0xbef5d917, + 0x170d0534, 0x430a5072, 0xbf03afb8, 0xeb04fafe, 0xbb36985a, 0xef31cd1c, + 0x133832d6, 0x473f6790, 0xeec7bbb3, 0xbac0eef5, 0x46c9113f, 0x12ce4479, + 0x10d4df88, 0x44d38ace, 0xb8da7504, 0xecdd2042, 0x4525fc61, 0x1122a927, + 0xed2b56ed, 0xb92c03ab, 0x9a9f274a, 0xce98720c, 0x32918dc6, 0x6696d880, + 0xcf6e04a3, 0x9b6951e5, 0x6760ae2f, 0x3367fb69, 0x317d6098, 0x657a35de, + 0x9973ca14, 0xcd749f52, 0x648c4371, 0x308b1637, 0xcc82e9fd, 0x9885bcbb, + 0xc8b7de1f, 0x9cb08b59, 0x60b97493, 0x34be21d5, 0x9d46fdf6, 0xc941a8b0, + 0x3548577a, 0x614f023c, 0x635599cd, 0x3752cc8b, 0xcb5b3341, 0x9f5c6607, + 0x36a4ba24, 0x62a3ef62, 0x9eaa10a8, 0xcaad45ee, 0x3eced5e0, 0x6ac980a6, + 0x96c07f6c, 0xc2c72a2a, 0x6b3ff609, 0x3f38a34f, 0xc3315c85, 0x973609c3, + 0x952c9232, 0xc12bc774, 0x3d2238be, 0x69256df8, 0xc0ddb1db, 0x94dae49d, + 0x68d31b57, 0x3cd44e11, 0x6ce62cb5, 0x38e179f3, 0xc4e88639, 0x90efd37f, + 0x39170f5c, 0x6d105a1a, 0x9119a5d0, 0xc51ef096, 0xc7046b67, 0x93033e21, + 0x6f0ac1eb, 0x3b0d94ad, 0x92f5488e, 0xc6f21dc8, 0x3afbe202, 0x6efcb744, + 0xd7d0b4ef, 0x83d7e1a9, 0x7fde1e63, 0x2bd94b25, 0x82219706, 0xd626c240, + 0x2a2f3d8a, 0x7e2868cc, 0x7c32f33d, 0x2835a67b, 0xd43c59b1, 0x803b0cf7, + 0x29c3d0d4, 0x7dc48592, 0x81cd7a58, 0xd5ca2f1e, 0x85f84dba, 0xd1ff18fc, + 0x2df6e736, 0x79f1b270, 0xd0096e53, 0x840e3b15, 0x7807c4df, 0x2c009199, + 0x2e1a0a68, 0x7a1d5f2e, 0x8614a0e4, 0xd213f5a2, 0x7beb2981, 0x2fec7cc7, + 
0xd3e5830d, 0x87e2d64b, 0x73814645, 0x27861303, 0xdb8fecc9, 0x8f88b98f, + 0x267065ac, 0x727730ea, 0x8e7ecf20, 0xda799a66, 0xd8630197, 0x8c6454d1, + 0x706dab1b, 0x246afe5d, 0x8d92227e, 0xd9957738, 0x259c88f2, 0x719bddb4, + 0x21a9bf10, 0x75aeea56, 0x89a7159c, 0xdda040da, 0x74589cf9, 0x205fc9bf, + 0xdc563675, 0x88516333, 0x8a4bf8c2, 0xde4cad84, 0x2245524e, 0x76420708, + 0xdfbadb2b, 0x8bbd8e6d, 0x77b471a7, 0x23b324e1}; + +const uint32_t kStrideExtensionTable2[256] = { + 0x00000000, 0x678efd01, 0xcf1dfa02, 0xa8930703, 0x9bd782f5, 0xfc597ff4, + 0x54ca78f7, 0x334485f6, 0x3243731b, 0x55cd8e1a, 0xfd5e8919, 0x9ad07418, + 0xa994f1ee, 0xce1a0cef, 0x66890bec, 0x0107f6ed, 0x6486e636, 0x03081b37, + 0xab9b1c34, 0xcc15e135, 0xff5164c3, 0x98df99c2, 0x304c9ec1, 0x57c263c0, + 0x56c5952d, 0x314b682c, 0x99d86f2f, 0xfe56922e, 0xcd1217d8, 0xaa9cead9, + 0x020fedda, 0x658110db, 0xc90dcc6c, 0xae83316d, 0x0610366e, 0x619ecb6f, + 0x52da4e99, 0x3554b398, 0x9dc7b49b, 0xfa49499a, 0xfb4ebf77, 0x9cc04276, + 0x34534575, 0x53ddb874, 0x60993d82, 0x0717c083, 0xaf84c780, 0xc80a3a81, + 0xad8b2a5a, 0xca05d75b, 0x6296d058, 0x05182d59, 0x365ca8af, 0x51d255ae, + 0xf94152ad, 0x9ecfafac, 0x9fc85941, 0xf846a440, 0x50d5a343, 0x375b5e42, + 0x041fdbb4, 0x639126b5, 0xcb0221b6, 0xac8cdcb7, 0x97f7ee29, 0xf0791328, + 0x58ea142b, 0x3f64e92a, 0x0c206cdc, 0x6bae91dd, 0xc33d96de, 0xa4b36bdf, + 0xa5b49d32, 0xc23a6033, 0x6aa96730, 0x0d279a31, 0x3e631fc7, 0x59ede2c6, + 0xf17ee5c5, 0x96f018c4, 0xf371081f, 0x94fff51e, 0x3c6cf21d, 0x5be20f1c, + 0x68a68aea, 0x0f2877eb, 0xa7bb70e8, 0xc0358de9, 0xc1327b04, 0xa6bc8605, + 0x0e2f8106, 0x69a17c07, 0x5ae5f9f1, 0x3d6b04f0, 0x95f803f3, 0xf276fef2, + 0x5efa2245, 0x3974df44, 0x91e7d847, 0xf6692546, 0xc52da0b0, 0xa2a35db1, + 0x0a305ab2, 0x6dbea7b3, 0x6cb9515e, 0x0b37ac5f, 0xa3a4ab5c, 0xc42a565d, + 0xf76ed3ab, 0x90e02eaa, 0x387329a9, 0x5ffdd4a8, 0x3a7cc473, 0x5df23972, + 0xf5613e71, 0x92efc370, 0xa1ab4686, 0xc625bb87, 0x6eb6bc84, 0x09384185, + 0x083fb768, 0x6fb14a69, 0xc7224d6a, 0xa0acb06b, 
0x93e8359d, 0xf466c89c, + 0x5cf5cf9f, 0x3b7b329e, 0x2a03aaa3, 0x4d8d57a2, 0xe51e50a1, 0x8290ada0, + 0xb1d42856, 0xd65ad557, 0x7ec9d254, 0x19472f55, 0x1840d9b8, 0x7fce24b9, + 0xd75d23ba, 0xb0d3debb, 0x83975b4d, 0xe419a64c, 0x4c8aa14f, 0x2b045c4e, + 0x4e854c95, 0x290bb194, 0x8198b697, 0xe6164b96, 0xd552ce60, 0xb2dc3361, + 0x1a4f3462, 0x7dc1c963, 0x7cc63f8e, 0x1b48c28f, 0xb3dbc58c, 0xd455388d, + 0xe711bd7b, 0x809f407a, 0x280c4779, 0x4f82ba78, 0xe30e66cf, 0x84809bce, + 0x2c139ccd, 0x4b9d61cc, 0x78d9e43a, 0x1f57193b, 0xb7c41e38, 0xd04ae339, + 0xd14d15d4, 0xb6c3e8d5, 0x1e50efd6, 0x79de12d7, 0x4a9a9721, 0x2d146a20, + 0x85876d23, 0xe2099022, 0x878880f9, 0xe0067df8, 0x48957afb, 0x2f1b87fa, + 0x1c5f020c, 0x7bd1ff0d, 0xd342f80e, 0xb4cc050f, 0xb5cbf3e2, 0xd2450ee3, + 0x7ad609e0, 0x1d58f4e1, 0x2e1c7117, 0x49928c16, 0xe1018b15, 0x868f7614, + 0xbdf4448a, 0xda7ab98b, 0x72e9be88, 0x15674389, 0x2623c67f, 0x41ad3b7e, + 0xe93e3c7d, 0x8eb0c17c, 0x8fb73791, 0xe839ca90, 0x40aacd93, 0x27243092, + 0x1460b564, 0x73ee4865, 0xdb7d4f66, 0xbcf3b267, 0xd972a2bc, 0xbefc5fbd, + 0x166f58be, 0x71e1a5bf, 0x42a52049, 0x252bdd48, 0x8db8da4b, 0xea36274a, + 0xeb31d1a7, 0x8cbf2ca6, 0x242c2ba5, 0x43a2d6a4, 0x70e65352, 0x1768ae53, + 0xbffba950, 0xd8755451, 0x74f988e6, 0x137775e7, 0xbbe472e4, 0xdc6a8fe5, + 0xef2e0a13, 0x88a0f712, 0x2033f011, 0x47bd0d10, 0x46bafbfd, 0x213406fc, + 0x89a701ff, 0xee29fcfe, 0xdd6d7908, 0xbae38409, 0x1270830a, 0x75fe7e0b, + 0x107f6ed0, 0x77f193d1, 0xdf6294d2, 0xb8ec69d3, 0x8ba8ec25, 0xec261124, + 0x44b51627, 0x233beb26, 0x223c1dcb, 0x45b2e0ca, 0xed21e7c9, 0x8aaf1ac8, + 0xb9eb9f3e, 0xde65623f, 0x76f6653c, 0x1178983d}; + +const uint32_t kStrideExtensionTable3[256] = { + 0x00000000, 0xf20c0dfe, 0xe1f46d0d, 0x13f860f3, 0xc604aceb, 0x3408a115, + 0x27f0c1e6, 0xd5fccc18, 0x89e52f27, 0x7be922d9, 0x6811422a, 0x9a1d4fd4, + 0x4fe183cc, 0xbded8e32, 0xae15eec1, 0x5c19e33f, 0x162628bf, 0xe42a2541, + 0xf7d245b2, 0x05de484c, 0xd0228454, 0x222e89aa, 0x31d6e959, 0xc3dae4a7, + 0x9fc30798, 
0x6dcf0a66, 0x7e376a95, 0x8c3b676b, 0x59c7ab73, 0xabcba68d, + 0xb833c67e, 0x4a3fcb80, 0x2c4c517e, 0xde405c80, 0xcdb83c73, 0x3fb4318d, + 0xea48fd95, 0x1844f06b, 0x0bbc9098, 0xf9b09d66, 0xa5a97e59, 0x57a573a7, + 0x445d1354, 0xb6511eaa, 0x63add2b2, 0x91a1df4c, 0x8259bfbf, 0x7055b241, + 0x3a6a79c1, 0xc866743f, 0xdb9e14cc, 0x29921932, 0xfc6ed52a, 0x0e62d8d4, + 0x1d9ab827, 0xef96b5d9, 0xb38f56e6, 0x41835b18, 0x527b3beb, 0xa0773615, + 0x758bfa0d, 0x8787f7f3, 0x947f9700, 0x66739afe, 0x5898a2fc, 0xaa94af02, + 0xb96ccff1, 0x4b60c20f, 0x9e9c0e17, 0x6c9003e9, 0x7f68631a, 0x8d646ee4, + 0xd17d8ddb, 0x23718025, 0x3089e0d6, 0xc285ed28, 0x17792130, 0xe5752cce, + 0xf68d4c3d, 0x048141c3, 0x4ebe8a43, 0xbcb287bd, 0xaf4ae74e, 0x5d46eab0, + 0x88ba26a8, 0x7ab62b56, 0x694e4ba5, 0x9b42465b, 0xc75ba564, 0x3557a89a, + 0x26afc869, 0xd4a3c597, 0x015f098f, 0xf3530471, 0xe0ab6482, 0x12a7697c, + 0x74d4f382, 0x86d8fe7c, 0x95209e8f, 0x672c9371, 0xb2d05f69, 0x40dc5297, + 0x53243264, 0xa1283f9a, 0xfd31dca5, 0x0f3dd15b, 0x1cc5b1a8, 0xeec9bc56, + 0x3b35704e, 0xc9397db0, 0xdac11d43, 0x28cd10bd, 0x62f2db3d, 0x90fed6c3, + 0x8306b630, 0x710abbce, 0xa4f677d6, 0x56fa7a28, 0x45021adb, 0xb70e1725, + 0xeb17f41a, 0x191bf9e4, 0x0ae39917, 0xf8ef94e9, 0x2d1358f1, 0xdf1f550f, + 0xcce735fc, 0x3eeb3802, 0xb13145f8, 0x433d4806, 0x50c528f5, 0xa2c9250b, + 0x7735e913, 0x8539e4ed, 0x96c1841e, 0x64cd89e0, 0x38d46adf, 0xcad86721, + 0xd92007d2, 0x2b2c0a2c, 0xfed0c634, 0x0cdccbca, 0x1f24ab39, 0xed28a6c7, + 0xa7176d47, 0x551b60b9, 0x46e3004a, 0xb4ef0db4, 0x6113c1ac, 0x931fcc52, + 0x80e7aca1, 0x72eba15f, 0x2ef24260, 0xdcfe4f9e, 0xcf062f6d, 0x3d0a2293, + 0xe8f6ee8b, 0x1afae375, 0x09028386, 0xfb0e8e78, 0x9d7d1486, 0x6f711978, + 0x7c89798b, 0x8e857475, 0x5b79b86d, 0xa975b593, 0xba8dd560, 0x4881d89e, + 0x14983ba1, 0xe694365f, 0xf56c56ac, 0x07605b52, 0xd29c974a, 0x20909ab4, + 0x3368fa47, 0xc164f7b9, 0x8b5b3c39, 0x795731c7, 0x6aaf5134, 0x98a35cca, + 0x4d5f90d2, 0xbf539d2c, 0xacabfddf, 0x5ea7f021, 0x02be131e, 0xf0b21ee0, + 0xe34a7e13, 
0x114673ed, 0xc4babff5, 0x36b6b20b, 0x254ed2f8, 0xd742df06, + 0xe9a9e704, 0x1ba5eafa, 0x085d8a09, 0xfa5187f7, 0x2fad4bef, 0xdda14611, + 0xce5926e2, 0x3c552b1c, 0x604cc823, 0x9240c5dd, 0x81b8a52e, 0x73b4a8d0, + 0xa64864c8, 0x54446936, 0x47bc09c5, 0xb5b0043b, 0xff8fcfbb, 0x0d83c245, + 0x1e7ba2b6, 0xec77af48, 0x398b6350, 0xcb876eae, 0xd87f0e5d, 0x2a7303a3, + 0x766ae09c, 0x8466ed62, 0x979e8d91, 0x6592806f, 0xb06e4c77, 0x42624189, + 0x519a217a, 0xa3962c84, 0xc5e5b67a, 0x37e9bb84, 0x2411db77, 0xd61dd689, + 0x03e11a91, 0xf1ed176f, 0xe215779c, 0x10197a62, 0x4c00995d, 0xbe0c94a3, + 0xadf4f450, 0x5ff8f9ae, 0x8a0435b6, 0x78083848, 0x6bf058bb, 0x99fc5545, + 0xd3c39ec5, 0x21cf933b, 0x3237f3c8, 0xc03bfe36, 0x15c7322e, 0xe7cb3fd0, + 0xf4335f23, 0x063f52dd, 0x5a26b1e2, 0xa82abc1c, 0xbbd2dcef, 0x49ded111, + 0x9c221d09, 0x6e2e10f7, 0x7dd67004, 0x8fda7dfa}; + +constexpr const ptrdiff_t kPrefetchHorizon = 256; + +} // namespace + +namespace crc32c { + +uint32_t ExtendPortable(uint32_t crc, const uint8_t* data, size_t size) { + const uint8_t* p = data; + const uint8_t* e = p + size; + uint32_t l = crc ^ kCRC32Xor; + +// Process one byte at a time. +#define STEP1 \ + do { \ + int c = (l & 0xff) ^ *p++; \ + l = kByteExtensionTable[c] ^ (l >> 8); \ + } while (0) + +// Process one of the 4 strides of 4-byte data. +#define STEP4(s) \ + do { \ + crc##s = ReadUint32LE(p + s * 4) ^ kStrideExtensionTable3[crc##s & 0xff] ^ \ + kStrideExtensionTable2[(crc##s >> 8) & 0xff] ^ \ + kStrideExtensionTable1[(crc##s >> 16) & 0xff] ^ \ + kStrideExtensionTable0[crc##s >> 24]; \ + } while (0) + +// Process a 16-byte swath of 4 strides, each of which has 4 bytes of data. +#define STEP16 \ + do { \ + STEP4(0); \ + STEP4(1); \ + STEP4(2); \ + STEP4(3); \ + p += 16; \ + } while (0) + +// Process 4 bytes that were already loaded into a word. 
+#define STEP4W(w) \ + do { \ + w ^= l; \ + for (size_t i = 0; i < 4; ++i) { \ + w = (w >> 8) ^ kByteExtensionTable[w & 0xff]; \ + } \ + l = w; \ + } while (0) + + // Point x at first 4-byte aligned byte in the buffer. This might be past the + // end of the buffer. + const uint8_t* x = RoundUp<4>(p); + if (x <= e) { + // Process bytes p is 4-byte aligned. + while (p != x) { + STEP1; + } + } + + if ((e - p) >= 16) { + // Load a 16-byte swath into the stride partial results. + uint32_t crc0 = ReadUint32LE(p + 0 * 4) ^ l; + uint32_t crc1 = ReadUint32LE(p + 1 * 4); + uint32_t crc2 = ReadUint32LE(p + 2 * 4); + uint32_t crc3 = ReadUint32LE(p + 3 * 4); + p += 16; + + while ((e - p) > kPrefetchHorizon) { + RequestPrefetch(p + kPrefetchHorizon); + + // Process 64 bytes at a time. + STEP16; + STEP16; + STEP16; + STEP16; + } + + // Process one 16-byte swath at a time. + while ((e - p) >= 16) { + STEP16; + } + + // Advance one word at a time as far as possible. + while ((e - p) >= 4) { + STEP4(0); + uint32_t tmp = crc0; + crc0 = crc1; + crc1 = crc2; + crc2 = crc3; + crc3 = tmp; + p += 4; + } + + // Combine the 4 partial stride results. + l = 0; + STEP4W(crc0); + STEP4W(crc1); + STEP4W(crc2); + STEP4W(crc3); + } + + // Process the last few bytes. + while (p != e) { + STEP1; + } +#undef STEP4W +#undef STEP16 +#undef STEP4 +#undef STEP1 + return l ^ kCRC32Xor; +} + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_portable_unittest.cc b/third_party/crc32c/src/src/crc32c_portable_unittest.cc new file mode 100644 index 0000000000..5098e2c373 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_portable_unittest.cc @@ -0,0 +1,20 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "gtest/gtest.h" + +#include "./crc32c_extend_unittests.h" +#include "./crc32c_internal.h" + +namespace crc32c { + +struct PortableTestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ExtendPortable(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Portable, ExtendTest, PortableTestTraits); + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_prefetch.h b/third_party/crc32c/src/src/crc32c_prefetch.h new file mode 100644 index 0000000000..e8df540494 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_prefetch.h @@ -0,0 +1,44 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_PREFETCH_H_ +#define CRC32C_CRC32C_PREFETCH_H_ + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_MM_PREFETCH + +#if defined(_MSC_VER) +#include <intrin.h> +#else // !defined(_MSC_VER) +#include <xmmintrin.h> +#endif // defined(_MSC_VER) + +#endif // HAVE_MM_PREFETCH + +namespace crc32c { + +// Ask the hardware to prefetch the data at the given address into the L1 cache. +inline void RequestPrefetch(const uint8_t* address) { +#if HAVE_BUILTIN_PREFETCH + // Clang and GCC implement the __builtin_prefetch non-standard extension, + // which maps to the best instruction on the target architecture. + __builtin_prefetch(reinterpret_cast<const char*>(address), 0 /* Read only. */, + 0 /* No temporal locality. */); +#elif HAVE_MM_PREFETCH + // Visual Studio doesn't implement __builtin_prefetch, but exposes the + // PREFETCHNTA instruction via the _mm_prefetch intrinsic. + _mm_prefetch(reinterpret_cast<const char*>(address), _MM_HINT_NTA); +#else + // No prefetch support. Silence compiler warnings. 
+ (void)address; +#endif // HAVE_BUILTIN_PREFETCH +} + +} // namespace crc32c + +#endif // CRC32C_CRC32C_ROUND_UP_H_ diff --git a/third_party/crc32c/src/src/crc32c_prefetch_unittest.cc b/third_party/crc32c/src/src/crc32c_prefetch_unittest.cc new file mode 100644 index 0000000000..b34ed2d5fe --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_prefetch_unittest.cc @@ -0,0 +1,9 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_prefetch.h" + +// There is no easy way to test cache prefetching. We can only test that the +// crc32c_prefetch.h header compiles on its own, so it doesn't have any unstated +// dependencies. diff --git a/third_party/crc32c/src/src/crc32c_read_le.h b/third_party/crc32c/src/src/crc32c_read_le.h new file mode 100644 index 0000000000..fe455623c2 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_read_le.h @@ -0,0 +1,51 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_READ_LE_H_ +#define CRC32C_CRC32C_READ_LE_H_ + +#include <cstdint> +#include <cstring> + +#include "crc32c/crc32c_config.h" + +namespace crc32c { + +// Reads a little-endian 32-bit integer from a 32-bit-aligned buffer. +inline uint32_t ReadUint32LE(const uint8_t* buffer) { +#if BYTE_ORDER_BIG_ENDIAN + return ((static_cast<uint32_t>(static_cast<uint8_t>(buffer[0]))) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[1])) << 8) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[2])) << 16) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[3])) << 24)); +#else // !BYTE_ORDER_BIG_ENDIAN + uint32_t result; + // This should be optimized to a single instruction. 
+ std::memcpy(&result, buffer, sizeof(result)); + return result; +#endif // BYTE_ORDER_BIG_ENDIAN +} + +// Reads a little-endian 64-bit integer from a 64-bit-aligned buffer. +inline uint64_t ReadUint64LE(const uint8_t* buffer) { +#if BYTE_ORDER_BIG_ENDIAN + return ((static_cast<uint32_t>(static_cast<uint8_t>(buffer[0]))) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[1])) << 8) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[2])) << 16) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[3])) << 24) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[4])) << 32) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[5])) << 40) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[6])) << 48) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[7])) << 56)); +#else // !BYTE_ORDER_BIG_ENDIAN + uint64_t result; + // This should be optimized to a single instruction. + std::memcpy(&result, buffer, sizeof(result)); + return result; +#endif // BYTE_ORDER_BIG_ENDIAN +} + +} // namespace crc32c + +#endif // CRC32C_CRC32C_READ_LE_H_ diff --git a/third_party/crc32c/src/src/crc32c_read_le_unittest.cc b/third_party/crc32c/src/src/crc32c_read_le_unittest.cc new file mode 100644 index 0000000000..2a30302adf --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_read_le_unittest.cc @@ -0,0 +1,32 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "./crc32c_read_le.h" + +#include <cstddef> +#include <cstdint> + +#include "gtest/gtest.h" + +#include "./crc32c_round_up.h" + +namespace crc32c { + +TEST(Crc32CReadLETest, ReadUint32LE) { + // little-endian 0x12345678 + alignas(4) uint8_t bytes[] = {0x78, 0x56, 0x34, 0x12}; + + ASSERT_EQ(RoundUp<4>(bytes), bytes) << "Stack array is not aligned"; + EXPECT_EQ(static_cast<uint32_t>(0x12345678), ReadUint32LE(bytes)); +} + +TEST(Crc32CReadLETest, ReadUint64LE) { + // little-endian 0x123456789ABCDEF0 + alignas(8) uint8_t bytes[] = {0xF0, 0xDE, 0xBC, 0x9A, 0x78, 0x56, 0x34, 0x12}; + + ASSERT_EQ(RoundUp<8>(bytes), bytes) << "Stack array is not aligned"; + EXPECT_EQ(static_cast<uint64_t>(0x123456789ABCDEF0), ReadUint64LE(bytes)); +} + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_round_up.h b/third_party/crc32c/src/src/crc32c_round_up.h new file mode 100644 index 0000000000..d3b922beb9 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_round_up.h @@ -0,0 +1,34 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_ROUND_UP_H_ +#define CRC32C_CRC32C_ROUND_UP_H_ + +#include <cstddef> +#include <cstdint> + +namespace crc32c { + +// Returns the smallest number >= the given number that is evenly divided by N. +// +// N must be a power of two. +template <int N> +constexpr inline uintptr_t RoundUp(uintptr_t pointer) { + static_assert((N & (N - 1)) == 0, "N must be a power of two"); + return (pointer + (N - 1)) & ~(N - 1); +} + +// Returns the smallest address >= the given address that is aligned to N bytes. +// +// N must be a power of two. 
+template <int N> +constexpr inline const uint8_t* RoundUp(const uint8_t* pointer) { + static_assert((N & (N - 1)) == 0, "N must be a power of two"); + return reinterpret_cast<uint8_t*>( + RoundUp<N>(reinterpret_cast<uintptr_t>(pointer))); +} + +} // namespace crc32c + +#endif // CRC32C_CRC32C_ROUND_UP_H_ diff --git a/third_party/crc32c/src/src/crc32c_round_up_unittest.cc b/third_party/crc32c/src/src/crc32c_round_up_unittest.cc new file mode 100644 index 0000000000..5ff657bb5c --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_round_up_unittest.cc @@ -0,0 +1,84 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_round_up.h" + +#include <cstddef> +#include <cstdint> + +#include "gtest/gtest.h" + +namespace crc32c { + +TEST(CRC32CRoundUpTest, RoundUpUintptr) { + uintptr_t zero = 0; + + ASSERT_EQ(zero, RoundUp<1>(zero)); + ASSERT_EQ(1U, RoundUp<1>(1U)); + ASSERT_EQ(2U, RoundUp<1>(2U)); + ASSERT_EQ(3U, RoundUp<1>(3U)); + ASSERT_EQ(~static_cast<uintptr_t>(0), RoundUp<1>(~static_cast<uintptr_t>(0))); + ASSERT_EQ(~static_cast<uintptr_t>(1), RoundUp<1>(~static_cast<uintptr_t>(1))); + ASSERT_EQ(~static_cast<uintptr_t>(2), RoundUp<1>(~static_cast<uintptr_t>(2))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<1>(~static_cast<uintptr_t>(3))); + + ASSERT_EQ(zero, RoundUp<2>(zero)); + ASSERT_EQ(2U, RoundUp<2>(1U)); + ASSERT_EQ(2U, RoundUp<2>(2U)); + ASSERT_EQ(4U, RoundUp<2>(3U)); + ASSERT_EQ(4U, RoundUp<2>(4U)); + ASSERT_EQ(6U, RoundUp<2>(5U)); + ASSERT_EQ(6U, RoundUp<2>(6U)); + ASSERT_EQ(8U, RoundUp<2>(7U)); + ASSERT_EQ(8U, RoundUp<2>(8U)); + ASSERT_EQ(~static_cast<uintptr_t>(1), RoundUp<2>(~static_cast<uintptr_t>(1))); + ASSERT_EQ(~static_cast<uintptr_t>(1), RoundUp<2>(~static_cast<uintptr_t>(2))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<2>(~static_cast<uintptr_t>(3))); + 
ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<2>(~static_cast<uintptr_t>(4))); + + ASSERT_EQ(zero, RoundUp<4>(zero)); + ASSERT_EQ(4U, RoundUp<4>(1U)); + ASSERT_EQ(4U, RoundUp<4>(2U)); + ASSERT_EQ(4U, RoundUp<4>(3U)); + ASSERT_EQ(4U, RoundUp<4>(4U)); + ASSERT_EQ(8U, RoundUp<4>(5U)); + ASSERT_EQ(8U, RoundUp<4>(6U)); + ASSERT_EQ(8U, RoundUp<4>(7U)); + ASSERT_EQ(8U, RoundUp<4>(8U)); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(3))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(4))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(5))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(6))); + ASSERT_EQ(~static_cast<uintptr_t>(7), RoundUp<4>(~static_cast<uintptr_t>(7))); + ASSERT_EQ(~static_cast<uintptr_t>(7), RoundUp<4>(~static_cast<uintptr_t>(8))); + ASSERT_EQ(~static_cast<uintptr_t>(7), RoundUp<4>(~static_cast<uintptr_t>(9))); +} + +TEST(CRC32CRoundUpTest, RoundUpPointer) { + uintptr_t zero = 0, three = 3, four = 4, seven = 7, eight = 8; + + const uint8_t* zero_ptr = reinterpret_cast<const uint8_t*>(zero); + const uint8_t* three_ptr = reinterpret_cast<const uint8_t*>(three); + const uint8_t* four_ptr = reinterpret_cast<const uint8_t*>(four); + const uint8_t* seven_ptr = reinterpret_cast<const uint8_t*>(seven); + const uint8_t* eight_ptr = reinterpret_cast<uint8_t*>(eight); + + ASSERT_EQ(zero_ptr, RoundUp<1>(zero_ptr)); + ASSERT_EQ(zero_ptr, RoundUp<4>(zero_ptr)); + ASSERT_EQ(zero_ptr, RoundUp<8>(zero_ptr)); + + ASSERT_EQ(three_ptr, RoundUp<1>(three_ptr)); + ASSERT_EQ(four_ptr, RoundUp<4>(three_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<8>(three_ptr)); + + ASSERT_EQ(four_ptr, RoundUp<1>(four_ptr)); + ASSERT_EQ(four_ptr, RoundUp<4>(four_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<8>(four_ptr)); + + ASSERT_EQ(seven_ptr, RoundUp<1>(seven_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<4>(seven_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<8>(four_ptr)); +} + +} // namespace crc32c diff 
--git a/third_party/crc32c/src/src/crc32c_sse42.cc b/third_party/crc32c/src/src/crc32c_sse42.cc new file mode 100644 index 0000000000..fc0cb0725f --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42.cc @@ -0,0 +1,256 @@ +// Copyright 2008 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_sse42.h" + +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable SSE4.2 instructions. + +// This implementation is loosely based on Intel Pub 323405 from April 2011, +// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction". + +#include <cstddef> +#include <cstdint> + +#include "./crc32c_internal.h" +#include "./crc32c_prefetch.h" +#include "./crc32c_read_le.h" +#include "./crc32c_round_up.h" +#include "crc32c/crc32c_config.h" + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +#if defined(_MSC_VER) +#include <intrin.h> +#else // !defined(_MSC_VER) +#include <nmmintrin.h> +#endif // defined(_MSC_VER) + +namespace crc32c { + +namespace { + +constexpr const ptrdiff_t kGroups = 3; +constexpr const ptrdiff_t kBlock0Size = 16 * 1024 / kGroups / 64 * 64; +constexpr const ptrdiff_t kBlock1Size = 4 * 1024 / kGroups / 8 * 8; +constexpr const ptrdiff_t kBlock2Size = 1024 / kGroups / 8 * 8; + +const uint32_t kBlock0SkipTable[8][16] = { + {0x00000000, 0xff770459, 0xfb027e43, 0x04757a1a, 0xf3e88a77, 0x0c9f8e2e, + 0x08eaf434, 0xf79df06d, 0xe23d621f, 0x1d4a6646, 0x193f1c5c, 0xe6481805, + 0x11d5e868, 0xeea2ec31, 0xead7962b, 0x15a09272}, + {0x00000000, 0xc196b2cf, 0x86c1136f, 0x4757a1a0, 0x086e502f, 0xc9f8e2e0, + 0x8eaf4340, 0x4f39f18f, 0x10dca05e, 0xd14a1291, 0x961db331, 0x578b01fe, + 0x18b2f071, 0xd92442be, 0x9e73e31e, 0x5fe551d1}, + {0x00000000, 0x21b940bc, 0x43728178, 0x62cbc1c4, 0x86e502f0, 0xa75c424c, + 
0xc5978388, 0xe42ec334, 0x08267311, 0x299f33ad, 0x4b54f269, 0x6aedb2d5, + 0x8ec371e1, 0xaf7a315d, 0xcdb1f099, 0xec08b025}, + {0x00000000, 0x104ce622, 0x2099cc44, 0x30d52a66, 0x41339888, 0x517f7eaa, + 0x61aa54cc, 0x71e6b2ee, 0x82673110, 0x922bd732, 0xa2fefd54, 0xb2b21b76, + 0xc354a998, 0xd3184fba, 0xe3cd65dc, 0xf38183fe}, + {0x00000000, 0x012214d1, 0x024429a2, 0x03663d73, 0x04885344, 0x05aa4795, + 0x06cc7ae6, 0x07ee6e37, 0x0910a688, 0x0832b259, 0x0b548f2a, 0x0a769bfb, + 0x0d98f5cc, 0x0cbae11d, 0x0fdcdc6e, 0x0efec8bf}, + {0x00000000, 0x12214d10, 0x24429a20, 0x3663d730, 0x48853440, 0x5aa47950, + 0x6cc7ae60, 0x7ee6e370, 0x910a6880, 0x832b2590, 0xb548f2a0, 0xa769bfb0, + 0xd98f5cc0, 0xcbae11d0, 0xfdcdc6e0, 0xefec8bf0}, + {0x00000000, 0x27f8a7f1, 0x4ff14fe2, 0x6809e813, 0x9fe29fc4, 0xb81a3835, + 0xd013d026, 0xf7eb77d7, 0x3a294979, 0x1dd1ee88, 0x75d8069b, 0x5220a16a, + 0xa5cbd6bd, 0x8233714c, 0xea3a995f, 0xcdc23eae}, + {0x00000000, 0x745292f2, 0xe8a525e4, 0x9cf7b716, 0xd4a63d39, 0xa0f4afcb, + 0x3c0318dd, 0x48518a2f, 0xaca00c83, 0xd8f29e71, 0x44052967, 0x3057bb95, + 0x780631ba, 0x0c54a348, 0x90a3145e, 0xe4f186ac}, +}; +const uint32_t kBlock1SkipTable[8][16] = { + {0x00000000, 0x79113270, 0xf22264e0, 0x8b335690, 0xe1a8bf31, 0x98b98d41, + 0x138adbd1, 0x6a9be9a1, 0xc6bd0893, 0xbfac3ae3, 0x349f6c73, 0x4d8e5e03, + 0x2715b7a2, 0x5e0485d2, 0xd537d342, 0xac26e132}, + {0x00000000, 0x889667d7, 0x14c0b95f, 0x9c56de88, 0x298172be, 0xa1171569, + 0x3d41cbe1, 0xb5d7ac36, 0x5302e57c, 0xdb9482ab, 0x47c25c23, 0xcf543bf4, + 0x7a8397c2, 0xf215f015, 0x6e432e9d, 0xe6d5494a}, + {0x00000000, 0xa605caf8, 0x49e7e301, 0xefe229f9, 0x93cfc602, 0x35ca0cfa, + 0xda282503, 0x7c2deffb, 0x2273faf5, 0x8476300d, 0x6b9419f4, 0xcd91d30c, + 0xb1bc3cf7, 0x17b9f60f, 0xf85bdff6, 0x5e5e150e}, + {0x00000000, 0x44e7f5ea, 0x89cfebd4, 0xcd281e3e, 0x1673a159, 0x529454b3, + 0x9fbc4a8d, 0xdb5bbf67, 0x2ce742b2, 0x6800b758, 0xa528a966, 0xe1cf5c8c, + 0x3a94e3eb, 0x7e731601, 0xb35b083f, 0xf7bcfdd5}, + {0x00000000, 0x59ce8564, 
0xb39d0ac8, 0xea538fac, 0x62d66361, 0x3b18e605, + 0xd14b69a9, 0x8885eccd, 0xc5acc6c2, 0x9c6243a6, 0x7631cc0a, 0x2fff496e, + 0xa77aa5a3, 0xfeb420c7, 0x14e7af6b, 0x4d292a0f}, + {0x00000000, 0x8eb5fb75, 0x1887801b, 0x96327b6e, 0x310f0036, 0xbfbafb43, + 0x2988802d, 0xa73d7b58, 0x621e006c, 0xecabfb19, 0x7a998077, 0xf42c7b02, + 0x5311005a, 0xdda4fb2f, 0x4b968041, 0xc5237b34}, + {0x00000000, 0xc43c00d8, 0x8d947741, 0x49a87799, 0x1ec49873, 0xdaf898ab, + 0x9350ef32, 0x576cefea, 0x3d8930e6, 0xf9b5303e, 0xb01d47a7, 0x7421477f, + 0x234da895, 0xe771a84d, 0xaed9dfd4, 0x6ae5df0c}, + {0x00000000, 0x7b1261cc, 0xf624c398, 0x8d36a254, 0xe9a5f1c1, 0x92b7900d, + 0x1f813259, 0x64935395, 0xd6a79573, 0xadb5f4bf, 0x208356eb, 0x5b913727, + 0x3f0264b2, 0x4410057e, 0xc926a72a, 0xb234c6e6}, +}; +const uint32_t kBlock2SkipTable[8][16] = { + {0x00000000, 0x8f158014, 0x1bc776d9, 0x94d2f6cd, 0x378eedb2, 0xb89b6da6, + 0x2c499b6b, 0xa35c1b7f, 0x6f1ddb64, 0xe0085b70, 0x74daadbd, 0xfbcf2da9, + 0x589336d6, 0xd786b6c2, 0x4354400f, 0xcc41c01b}, + {0x00000000, 0xde3bb6c8, 0xb99b1b61, 0x67a0ada9, 0x76da4033, 0xa8e1f6fb, + 0xcf415b52, 0x117aed9a, 0xedb48066, 0x338f36ae, 0x542f9b07, 0x8a142dcf, + 0x9b6ec055, 0x4555769d, 0x22f5db34, 0xfcce6dfc}, + {0x00000000, 0xde85763d, 0xb8e69a8b, 0x6663ecb6, 0x742143e7, 0xaaa435da, + 0xccc7d96c, 0x1242af51, 0xe84287ce, 0x36c7f1f3, 0x50a41d45, 0x8e216b78, + 0x9c63c429, 0x42e6b214, 0x24855ea2, 0xfa00289f}, + {0x00000000, 0xd569796d, 0xaf3e842b, 0x7a57fd46, 0x5b917ea7, 0x8ef807ca, + 0xf4affa8c, 0x21c683e1, 0xb722fd4e, 0x624b8423, 0x181c7965, 0xcd750008, + 0xecb383e9, 0x39dafa84, 0x438d07c2, 0x96e47eaf}, + {0x00000000, 0x6ba98c6d, 0xd75318da, 0xbcfa94b7, 0xab4a4745, 0xc0e3cb28, + 0x7c195f9f, 0x17b0d3f2, 0x5378f87b, 0x38d17416, 0x842be0a1, 0xef826ccc, + 0xf832bf3e, 0x939b3353, 0x2f61a7e4, 0x44c82b89}, + {0x00000000, 0xa6f1f0f6, 0x480f971d, 0xeefe67eb, 0x901f2e3a, 0x36eedecc, + 0xd810b927, 0x7ee149d1, 0x25d22a85, 0x8323da73, 0x6dddbd98, 0xcb2c4d6e, + 0xb5cd04bf, 0x133cf449, 
0xfdc293a2, 0x5b336354}, + {0x00000000, 0x4ba4550a, 0x9748aa14, 0xdcecff1e, 0x2b7d22d9, 0x60d977d3, + 0xbc3588cd, 0xf791ddc7, 0x56fa45b2, 0x1d5e10b8, 0xc1b2efa6, 0x8a16baac, + 0x7d87676b, 0x36233261, 0xeacfcd7f, 0xa16b9875}, + {0x00000000, 0xadf48b64, 0x5e056039, 0xf3f1eb5d, 0xbc0ac072, 0x11fe4b16, + 0xe20fa04b, 0x4ffb2b2f, 0x7df9f615, 0xd00d7d71, 0x23fc962c, 0x8e081d48, + 0xc1f33667, 0x6c07bd03, 0x9ff6565e, 0x3202dd3a}, +}; + +constexpr const ptrdiff_t kPrefetchHorizon = 256; + +} // namespace + +uint32_t ExtendSse42(uint32_t crc, const uint8_t* data, size_t size) { + const uint8_t* p = data; + const uint8_t* e = data + size; + uint32_t l = crc ^ kCRC32Xor; + +#define STEP1 \ + do { \ + l = _mm_crc32_u8(l, *p++); \ + } while (0) + +#define STEP4(crc) \ + do { \ + crc = _mm_crc32_u32(crc, ReadUint32LE(p)); \ + p += 4; \ + } while (0) + +#define STEP8(crc, data) \ + do { \ + crc = _mm_crc32_u64(crc, ReadUint64LE(data)); \ + data += 8; \ + } while (0) + +#define STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \ + do { \ + STEP8(crc0, p0); \ + STEP8(crc1, p1); \ + STEP8(crc2, p2); \ + } while (0) + +#define STEP8X3(crc0, crc1, crc2, bs) \ + do { \ + crc0 = _mm_crc32_u64(crc0, ReadUint64LE(p)); \ + crc1 = _mm_crc32_u64(crc1, ReadUint64LE(p + bs)); \ + crc2 = _mm_crc32_u64(crc2, ReadUint64LE(p + 2 * bs)); \ + p += 8; \ + } while (0) + +#define SKIP_BLOCK(crc, tab) \ + do { \ + crc = tab[0][crc & 0xf] ^ tab[1][(crc >> 4) & 0xf] ^ \ + tab[2][(crc >> 8) & 0xf] ^ tab[3][(crc >> 12) & 0xf] ^ \ + tab[4][(crc >> 16) & 0xf] ^ tab[5][(crc >> 20) & 0xf] ^ \ + tab[6][(crc >> 24) & 0xf] ^ tab[7][(crc >> 28) & 0xf]; \ + } while (0) + + // Point x at first 8-byte aligned byte in the buffer. This might be past the + // end of the buffer. + const uint8_t* x = RoundUp<8>(p); + if (x <= e) { + // Process bytes p is 8-byte aligned. + while (p != x) { + STEP1; + } + } + + // Proccess the data in predetermined block sizes with tables for quickly + // combining the checksum. 
Experimentally it's better to use larger block + // sizes where possible so use a hierarchy of decreasing block sizes. + uint64_t l64 = l; + while ((e - p) >= kGroups * kBlock0Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock0Size; i += 8 * 8) { + // Prefetch ahead to hide latency. + RequestPrefetch(p + kPrefetchHorizon); + RequestPrefetch(p + kBlock0Size + kPrefetchHorizon); + RequestPrefetch(p + 2 * kBlock0Size + kPrefetchHorizon); + + // Process 64 bytes at a time. + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + } + + // Combine results. + SKIP_BLOCK(l64, kBlock0SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock0SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock0Size; + } + while ((e - p) >= kGroups * kBlock1Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock1Size; i += 8) { + STEP8X3(l64, l641, l642, kBlock1Size); + } + SKIP_BLOCK(l64, kBlock1SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock1SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock1Size; + } + while ((e - p) >= kGroups * kBlock2Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock2Size; i += 8) { + STEP8X3(l64, l641, l642, kBlock2Size); + } + SKIP_BLOCK(l64, kBlock2SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock2SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock2Size; + } + + // Process bytes 16 at a time + while ((e - p) >= 16) { + STEP8(l64, p); + STEP8(l64, p); + } + + l = static_cast<uint32_t>(l64); + // Process the last few bytes. 
+ while (p != e) { + STEP1; + } +#undef SKIP_BLOCK +#undef STEP8X3 +#undef STEP8BY3 +#undef STEP8 +#undef STEP4 +#undef STEP1 + + return l ^ kCRC32Xor; +} + +} // namespace crc32c + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) diff --git a/third_party/crc32c/src/src/crc32c_sse42.h b/third_party/crc32c/src/src/crc32c_sse42.h new file mode 100644 index 0000000000..b9ed179e54 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42.h @@ -0,0 +1,31 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_SSE42_H_ +#define CRC32C_CRC32C_SSE42_H_ + +// X86-specific code. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +// The hardware-accelerated implementation is only enabled for 64-bit builds, +// because a straightforward 32-bit implementation actually runs slower than the +// portable version. Most X86 machines are 64-bit nowadays, so it doesn't make +// much sense to spend time building an optimized hardware-accelerated +// implementation. +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +namespace crc32c { + +// SSE4.2-accelerated implementation in crc32c_sse42.cc +uint32_t ExtendSse42(uint32_t crc, const uint8_t* data, size_t count); + +} // namespace crc32c + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +#endif // CRC32C_CRC32C_SSE42_H_ diff --git a/third_party/crc32c/src/src/crc32c_sse42_check.h b/third_party/crc32c/src/src/crc32c_sse42_check.h new file mode 100644 index 0000000000..ad380dd20e --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42_check.h @@ -0,0 +1,48 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef CRC32C_CRC32C_SSE42_CHECK_H_ +#define CRC32C_CRC32C_SSE42_CHECK_H_ + +// X86-specific code checking the availability of SSE4.2 instructions. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +// If the compiler supports SSE4.2, it definitely supports X86. + +#if defined(_MSC_VER) +#include <intrin.h> + +namespace crc32c { + +inline bool CanUseSse42() { + int cpu_info[4]; + __cpuid(cpu_info, 1); + return (cpu_info[2] & (1 << 20)) != 0; +} + +} // namespace crc32c + +#else // !defined(_MSC_VER) +#include <cpuid.h> + +namespace crc32c { + +inline bool CanUseSse42() { + unsigned int eax, ebx, ecx, edx; + return __get_cpuid(1, &eax, &ebx, &ecx, &edx) && ((ecx & (1 << 20)) != 0); +} + +} // namespace crc32c + +#endif // defined(_MSC_VER) + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +#endif // CRC32C_CRC32C_SSE42_CHECK_H_ diff --git a/third_party/crc32c/src/src/crc32c_sse42_unittest.cc b/third_party/crc32c/src/src/crc32c_sse42_unittest.cc new file mode 100644 index 0000000000..c73ad8ddd1 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42_unittest.cc @@ -0,0 +1,24 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "gtest/gtest.h" + +#include "./crc32c_extend_unittests.h" +#include "./crc32c_sse42.h" + +namespace crc32c { + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +struct Sse42TestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ExtendSse42(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Sse42, ExtendTest, Sse42TestTraits); + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_test_main.cc b/third_party/crc32c/src/src/crc32c_test_main.cc new file mode 100644 index 0000000000..c07e1c8bc4 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_test_main.cc @@ -0,0 +1,20 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "crc32c/crc32c_config.h" + +#include "gtest/gtest.h" + +#if CRC32C_TESTS_BUILT_WITH_GLOG +#include "glog/logging.h" +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + +int main(int argc, char** argv) { +#if CRC32C_TESTS_BUILT_WITH_GLOG + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/third_party/crc32c/src/src/crc32c_unittest.cc b/third_party/crc32c/src/src/crc32c_unittest.cc new file mode 100644 index 0000000000..d6c6af680c --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_unittest.cc @@ -0,0 +1,129 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "crc32c/crc32c.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "gtest/gtest.h" + +#include "./crc32c_extend_unittests.h" + +TEST(Crc32CTest, Crc32c) { + // From rfc3720 section B.4. + uint8_t buf[32]; + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), + crc32c::Crc32c(buf, sizeof(buf))); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), + crc32c::Crc32c(buf, sizeof(buf))); + + uint8_t data[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + EXPECT_EQ(static_cast<uint32_t>(0xd9963a56), + crc32c::Crc32c(data, sizeof(data))); +} + +namespace crc32c { + +struct ApiTestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ::crc32c::Extend(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Api, ExtendTest, ApiTestTraits); + +} // namespace crc32c + +TEST(CRC32CTest, Crc32cCharPointer) { + char buf[32]; + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), + crc32c::Crc32c(buf, sizeof(buf))); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = 
static_cast<char>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), + crc32c::Crc32c(buf, sizeof(buf))); +} + +TEST(CRC32CTest, Crc32cStdString) { + std::string buf; + buf.resize(32); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(0x00); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), crc32c::Crc32c(buf)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = '\xff'; + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), crc32c::Crc32c(buf)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), crc32c::Crc32c(buf)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), crc32c::Crc32c(buf)); +} + +#if __cplusplus > 201402L +#if __has_include(<string_view>) + +TEST(CRC32CTest, Crc32cStdStringView) { + uint8_t buf[32]; + std::string_view view(reinterpret_cast<const char*>(buf), sizeof(buf)); + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), crc32c::Crc32c(view)); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), crc32c::Crc32c(view)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), crc32c::Crc32c(view)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), crc32c::Crc32c(view)); +} + +#endif // __has_include(<string_view>) +#endif // __cplusplus > 201402L + +#define TESTED_EXTEND Extend +#include "./crc32c_extend_unittests.h" +#undef TESTED_EXTEND diff --git a/third_party/libaom/CMakeLists.txt b/third_party/libaom/CMakeLists.txt index 2e84ff8350..55ac2e0bc7 100644 --- a/third_party/libaom/CMakeLists.txt +++ b/third_party/libaom/CMakeLists.txt @@ -91,22 +91,27 @@ elseif(LINUX_AARCH64) LICENSE "BSD-3-Clause" SRC + ${AOM_ROOT}/libaom/aom_ports/arm_cpudetect.c ${aom_av1_common_intrin_neon} ${aom_av1_common_sources} 
${aom_av1_decoder_sources} + ${aom_av1_encoder_intrin_neon} + ${aom_av1_encoder_sources} ${aom_dsp_common_intrin_neon} ${aom_dsp_common_sources} ${aom_dsp_decoder_sources} + ${aom_dsp_encoder_intrin_neon} + ${aom_dsp_encoder_sources} ${aom_mem_sources} - ${AOM_ROOT}/libaom/aom_ports/arm_cpudetect.c ${aom_rtcd_sources} ${aom_scale_sources} ${aom_sources} ${aom_util_sources}) target_include_directories( webrtc_libaom - PRIVATE ${AOM_ROOT}/config ${AOM_ROOT}/config/arm64 + PRIVATE ${AOM_ROOT}/config ${AOM_ROOT}/config/linux/arm64 PUBLIC ${AOM_ROOT}/libaom) +# target_compile_options(webrtc_libaom PRIVATE "-mfpu=neon") else() message(FATAL_ERROR "This can only be used in linux builds") endif() diff --git a/third_party/libaom/libaom_src.cmake b/third_party/libaom/libaom_src.cmake index 0f60cfc0ea..b582a517af 100644 --- a/third_party/libaom/libaom_src.cmake +++ b/third_party/libaom/libaom_src.cmake @@ -288,6 +288,8 @@ set(aom_av1_encoder_sources "${AOM_ROOT}/libaom/av1/encoder/extend.c" + "${AOM_ROOT}/libaom/av1/encoder/external_partition.c" + "${AOM_ROOT}/libaom/av1/encoder/hash.c" @@ -609,6 +611,8 @@ set(aom_dsp_encoder_sources "${AOM_ROOT}/libaom/aom_dsp/sad.c" "${AOM_ROOT}/libaom/aom_dsp/sad_av1.c" "${AOM_ROOT}/libaom/aom_dsp/sse.c" + "${AOM_ROOT}/libaom/aom_dsp/ssim.c" + "${AOM_ROOT}/libaom/aom_dsp/sum_squares.c" "${AOM_ROOT}/libaom/aom_dsp/variance.c" @@ -660,6 +664,7 @@ set(aom_sources + "${AOM_ROOT}/libaom/aom/src/aom_codec.c" "${AOM_ROOT}/libaom/aom/src/aom_decoder.c" "${AOM_ROOT}/libaom/aom/src/aom_encoder.c" diff --git a/third_party/libaom/source/config/config/aom_version.h b/third_party/libaom/source/config/config/aom_version.h index 9c9e03e384..d62ceb34f7 100644 --- a/third_party/libaom/source/config/config/aom_version.h +++ b/third_party/libaom/source/config/config/aom_version.h @@ -9,11 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#define VERSION_MAJOR 2 -#define VERSION_MINOR 0 -#define VERSION_PATCH 2 -#define VERSION_EXTRA "1395-g79b775799" +#define VERSION_MAJOR 3 +#define VERSION_MINOR 1 +#define VERSION_PATCH 0 +#define VERSION_EXTRA "309-g12287adee" #define VERSION_PACKED \ ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH)) -#define VERSION_STRING_NOSP "2.0.2-1395-g79b775799" -#define VERSION_STRING " 2.0.2-1395-g79b775799" +#define VERSION_STRING_NOSP "3.1.0-309-g12287adee" +#define VERSION_STRING " 3.1.0-309-g12287adee" diff --git a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm +++ b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h index 655ca4c8dc..f79ffc6929 
100644 --- a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h +++ b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/ios/arm64/config/aom_config.asm 
b/third_party/libaom/source/config/ios/arm64/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/ios/arm64/config/aom_config.asm +++ b/third_party/libaom/source/config/ios/arm64/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/ios/arm64/config/aom_config.h b/third_party/libaom/source/config/ios/arm64/config/aom_config.h index 655ca4c8dc..f79ffc6929 100644 --- a/third_party/libaom/source/config/ios/arm64/config/aom_config.h +++ b/third_party/libaom/source/config/ios/arm64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 
#define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm index fbaae3b28d..ac5f20f3b1 100644 --- a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 
CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h index adb548aa40..c8e44f4edd 100644 --- a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git 
a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h index 61141406d2..a4df74d40d 100644 --- a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h @@ -3813,6 +3813,17 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 
CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h index 655ca4c8dc..f79ffc6929 100644 --- a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + 
uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm/config/aom_config.asm b/third_party/libaom/source/config/linux/arm/config/aom_config.asm index e9000243ad..bc1a95f003 100644 --- a/third_party/libaom/source/config/linux/arm/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm/config/aom_config.h b/third_party/libaom/source/config/linux/arm/config/aom_config.h index 0404a4c827..f3ac36f68c 100644 --- a/third_party/libaom/source/config/linux/arm/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define 
CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h index d7b1b04f00..8710d625ed 100644 --- a/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h @@ -2953,6 +2953,17 @@ int64_t aom_sse_c(const uint8_t* a, int height); #define aom_sse aom_sse_c +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm64/config/aom_config.asm b/third_party/libaom/source/config/linux/arm64/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/linux/arm64/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm64/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 
CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm64/config/aom_config.h b/third_party/libaom/source/config/linux/arm64/config/aom_config.h index 655ca4c8dc..f79ffc6929 100644 --- a/third_party/libaom/source/config/linux/arm64/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define 
CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/generic/config/aom_config.asm b/third_party/libaom/source/config/linux/generic/config/aom_config.asm index 0e681a032e..24b965dfb3 100644 --- a/third_party/libaom/source/config/linux/generic/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/generic/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 
@@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/generic/config/aom_config.h b/third_party/libaom/source/config/linux/generic/config/aom_config.h index 0e1665a47e..cdb4794210 100644 --- a/third_party/libaom/source/config/linux/generic/config/aom_config.h +++ b/third_party/libaom/source/config/linux/generic/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h index 05bfa838bb..702c1b809e 100644 --- a/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h @@ -2953,6 +2953,17 @@ int64_t aom_sse_c(const uint8_t* a, int height); #define aom_sse aom_sse_c +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, 
+ const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/ia32/config/aom_config.asm b/third_party/libaom/source/config/linux/ia32/config/aom_config.asm index d8ec860317..f4e2dfb836 100644 --- a/third_party/libaom/source/config/linux/ia32/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/ia32/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 %define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 1 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 1 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/ia32/config/aom_config.h b/third_party/libaom/source/config/linux/ia32/config/aom_config.h index 53666caafa..1b3bba6797 100644 --- a/third_party/libaom/source/config/linux/ia32/config/aom_config.h +++ b/third_party/libaom/source/config/linux/ia32/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define 
CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 1 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h index 64bc1f4056..323c55e888 100644 --- a/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h @@ -6787,6 +6787,17 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/x64/config/aom_config.asm b/third_party/libaom/source/config/linux/x64/config/aom_config.asm index 0fdb4ea1e8..b15994bbd7 100644 --- a/third_party/libaom/source/config/linux/x64/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/x64/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 
%define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 1 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 0 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/x64/config/aom_config.h b/third_party/libaom/source/config/linux/x64/config/aom_config.h index d026bc215f..d090f8398a 100644 --- a/third_party/libaom/source/config/linux/x64/config/aom_config.h +++ b/third_party/libaom/source/config/linux/x64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 
#define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h index 58de231219..dd561e4498 100644 --- a/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h @@ -6814,6 +6814,26 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +void aom_ssim_parms_8x8_sse2(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2 + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/win/arm64/config/aom_config.asm b/third_party/libaom/source/config/win/arm64/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/win/arm64/config/aom_config.asm +++ b/third_party/libaom/source/config/win/arm64/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 
equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/win/arm64/config/aom_config.h b/third_party/libaom/source/config/win/arm64/config/aom_config.h index 5ca170928b..c744a45ff4 100644 --- a/third_party/libaom/source/config/win/arm64/config/aom_config.h +++ b/third_party/libaom/source/config/win/arm64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 0 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h +++ 
b/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/win/ia32/config/aom_config.asm b/third_party/libaom/source/config/win/ia32/config/aom_config.asm index 789f7c98f7..ad1912f54d 100644 --- a/third_party/libaom/source/config/win/ia32/config/aom_config.asm +++ b/third_party/libaom/source/config/win/ia32/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 %define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 0 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 1 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/ia32/config/aom_config.h b/third_party/libaom/source/config/win/ia32/config/aom_config.h index e9cafd4296..044ba296e6 100644 --- 
a/third_party/libaom/source/config/win/ia32/config/aom_config.h +++ b/third_party/libaom/source/config/win/ia32/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 0 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 1 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h index 64bc1f4056..323c55e888 100644 --- a/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h @@ -6787,6 +6787,17 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/win/x64/config/aom_config.asm b/third_party/libaom/source/config/win/x64/config/aom_config.asm index 
bdebbbe6b3..f3e1660a08 100644 --- a/third_party/libaom/source/config/win/x64/config/aom_config.asm +++ b/third_party/libaom/source/config/win/x64/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 %define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 0 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 0 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/x64/config/aom_config.h b/third_party/libaom/source/config/win/x64/config/aom_config.h index 6187935081..1adc7b7407 100644 --- a/third_party/libaom/source/config/win/x64/config/aom_config.h +++ b/third_party/libaom/source/config/win/x64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 0 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define 
CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h index 58de231219..dd561e4498 100644 --- a/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h @@ -6814,6 +6814,26 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +void aom_ssim_parms_8x8_sse2(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2 + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/libaom/aom/aom.h b/third_party/libaom/source/libaom/aom/aom.h index c591dc9a43..0650a11f6b 100644 --- a/third_party/libaom/source/libaom/aom/aom.h +++ b/third_party/libaom/source/libaom/aom/aom.h @@ -41,27 +41,45 @@ extern "C" { /*!\brief Control functions * * The set of macros define the control functions of AOM interface + * The range for common control IDs is 230-255(max). */ enum aom_com_control_id { - /* TODO(https://crbug.com/aomedia/2671): The encoder overlaps the range of - * these values for its control ids, see the NOTEs in aom/aomcx.h. 
These - * should be migrated to something like the AOM_DECODER_CTRL_ID_START range - * next time we're ready to break the ABI. + /*!\brief Codec control function to get a pointer to a reference frame + * + * av1_ref_frame_t* parameter */ - AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame, - av1_ref_frame_t* parameter */ - AV1_SET_REFERENCE = 129, /**< write a frame into a reference buffer, - av1_ref_frame_t* parameter */ - AV1_COPY_REFERENCE = 130, /**< get a copy of reference frame from the decoderm - av1_ref_frame_t* parameter */ - AOM_COMMON_CTRL_ID_MAX, - - AV1_GET_NEW_FRAME_IMAGE = - 192, /**< get a pointer to the new frame, aom_image_t* parameter */ - AV1_COPY_NEW_FRAME_IMAGE = 193, /**< copy the new frame to an external buffer, - aom_image_t* parameter */ + AV1_GET_REFERENCE = 230, + /*!\brief Codec control function to write a frame into a reference buffer + * + * av1_ref_frame_t* parameter + */ + AV1_SET_REFERENCE = 231, + + /*!\brief Codec control function to get a copy of reference frame from the + * decoder + * + * av1_ref_frame_t* parameter + */ + AV1_COPY_REFERENCE = 232, + + /*!\brief Codec control function to get a pointer to the new frame + * + * aom_image_t* parameter + */ + AV1_GET_NEW_FRAME_IMAGE = 233, + + /*!\brief Codec control function to copy the new frame to an external buffer + * + * aom_image_t* parameter + */ + AV1_COPY_NEW_FRAME_IMAGE = 234, + + /*!\brief Start point of control IDs for aom_dec_control_id. + * Any new common control IDs should be added above. + */ AOM_DECODER_CTRL_ID_START = 256 + // No common control IDs should be added after AOM_DECODER_CTRL_ID_START. 
}; /*!\brief AV1 specific reference frame data struct diff --git a/third_party/libaom/source/libaom/aom/aom_codec.h b/third_party/libaom/source/libaom/aom/aom_codec.h index f58272ee2c..a2a9efaef3 100644 --- a/third_party/libaom/source/libaom/aom/aom_codec.h +++ b/third_party/libaom/source/libaom/aom/aom_codec.h @@ -149,7 +149,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define AOM_CODEC_ABI_VERSION (6 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ +#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ /*!\brief Algorithm return codes */ typedef enum { diff --git a/third_party/libaom/source/libaom/aom/aom_encoder.h b/third_party/libaom/source/libaom/aom/aom_encoder.h index 48e705646d..a98c8d8270 100644 --- a/third_party/libaom/source/libaom/aom/aom_encoder.h +++ b/third_party/libaom/source/libaom/aom/aom_encoder.h @@ -31,6 +31,7 @@ extern "C" { #endif #include "aom/aom_codec.h" +#include "aom/aom_external_partition.h" /*!\brief Current ABI version number * @@ -41,7 +42,7 @@ extern "C" { * fields to structures */ #define AOM_ENCODER_ABI_VERSION \ - (9 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (9 + AOM_CODEC_ABI_VERSION + AOM_EXT_PART_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -142,15 +143,8 @@ typedef struct aom_codec_cx_pkt { double psnr_hbd[4]; } psnr; /**< data for PSNR packet */ aom_fixed_buf_t raw; /**< data for arbitrary packets */ - - /* This packet size is fixed to allow codecs to extend this - * interface without having to manage storage for raw packets, - * i.e., if it's smaller than 128 bytes, you can store in the - * packet list directly. 
- */ - char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */ - } data; /**< packet data */ -} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ + } data; /**< packet data */ +} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ /*!\brief Rational Number * @@ -300,10 +294,6 @@ typedef struct cfg_options { * */ unsigned int disable_smooth_intra; - /*!\brief disable D45 to D203 intra modes - * - */ - unsigned int disable_diagonal_intra; /*!\brief disable filter intra * */ @@ -880,11 +870,11 @@ typedef struct aom_codec_enc_cfg { */ unsigned int use_fixed_qp_offsets; -/*!\brief Number of fixed QP offsets +/*!\brief Max number of fixed QP offsets * * This defines the number of elements in the fixed_qp_offsets array. */ -#define FIXED_QP_OFFSET_COUNT 5 +#define FIXED_QP_OFFSET_COUNT 6 /*!\brief Array of fixed QP offsets * diff --git a/third_party/libaom/source/libaom/aom/aom_external_partition.h b/third_party/libaom/source/libaom/aom/aom_external_partition.h new file mode 100644 index 0000000000..3710466316 --- /dev/null +++ b/third_party/libaom/source/libaom/aom/aom_external_partition.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_EXTERNAL_PARTITION_H_ +#define AOM_AOM_AOM_EXTERNAL_PARTITION_H_ + +/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder + * \ingroup aom + * + * @{ + */ +#include "./aom_integer.h" + +/*!\file + * \brief Provides function pointer definitions for the external partition. 
+ */ + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures. + */ +#define AOM_EXT_PART_ABI_VERSION (1) + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Abstract external partition model handler + */ +typedef void *aom_ext_part_model_t; + +/*!\brief Number of features to determine whether to skip partition none and + * do partition split directly. The same as "FEATURE_SIZE_SMS_SPLIT". + */ +#define SIZE_DIRECT_SPLIT 17 + +/*!\brief Number of features to use simple motion search to prune out + * rectangular partition in some direction. The same as + * "FEATURE_SIZE_SMS_PRUNE_PART". + */ +#define SIZE_PRUNE_PART 25 + +/*!\brief Number of features to prune split and rectangular partition + * after PARTITION_NONE. + */ +#define SIZE_PRUNE_NONE 4 + +/*!\brief Number of features to terminates partition after partition none using + * simple_motion_search features and the rate, distortion, and rdcost of + * PARTITION_NONE. The same as "FEATURE_SIZE_SMS_TERM_NONE". + */ +#define SIZE_TERM_NONE 28 + +/*!\brief Number of features to terminates partition after partition split. + */ +#define SIZE_TERM_SPLIT 31 + +/*!\brief Number of features to prune rectangular partition using stats + * collected after partition split. + */ +#define SIZE_PRUNE_RECT 9 + +/*!\brief Number of features to prune AB partition using stats + * collected after rectangular partition.. + */ +#define SIZE_PRUNE_AB 10 + +/*!\brief Number of features to prune 4-way partition using stats + * collected after AB partition. + */ +#define SIZE_PRUNE_4_WAY 18 + +/*!\brief Config information sent to the external partition model. + * + * For example, the maximum superblock size determined by the sequence header. 
+ */ +typedef struct aom_ext_part_config { + int superblock_size; /**< super block size (either 64x64 or 128x128) */ +} aom_ext_part_config_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected before NONE partition. + * Features "f" are used to determine: + * partition_none_allowed, partition_horz_allowed, partition_vert_allowed, + * do_rectangular_split, do_square_split + * Features "f_part2" are used to determine: + * prune_horz, prune_vert. + */ +typedef struct aom_partition_features_before_none { + float f[SIZE_DIRECT_SPLIT]; /**< features to determine whether skip partition + none and do split directly */ + float f_part2[SIZE_PRUNE_PART]; /**< features to determine whether to prune + rectangular partition */ +} aom_partition_features_before_none_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after NONE partition. + */ +typedef struct aom_partition_features_none { + float f[SIZE_PRUNE_NONE]; /**< features to prune split and rectangular + partition*/ + float f_terminate[SIZE_TERM_NONE]; /**< features to determine termination of + partition */ +} aom_partition_features_none_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after SPLIT partition. + */ +typedef struct aom_partition_features_split { + float f_terminate[SIZE_TERM_SPLIT]; /**< features to determine termination of + partition */ + float f_prune_rect[SIZE_PRUNE_RECT]; /**< features to determine pruning rect + partition */ +} aom_partition_features_split_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after RECTANGULAR partition. 
+ */ +typedef struct aom_partition_features_rect { + float f[SIZE_PRUNE_AB]; /**< features to determine pruning AB partition */ +} aom_partition_features_rect_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after AB partition: HORZ_A, HORZ_B, VERT_A, + * VERT_B. + */ +typedef struct aom_partition_features_ab { + float + f[SIZE_PRUNE_4_WAY]; /**< features to determine pruning 4-way partition */ +} aom_partition_features_ab_t; + +/*!\brief Feature id to tell the external model the current stage in partition + * pruning and what features to use to make decisions accordingly. + */ +typedef enum { + FEATURE_BEFORE_PART_NONE, + FEATURE_BEFORE_PART_NONE_PART2, + FEATURE_AFTER_PART_NONE, + FEATURE_AFTER_PART_NONE_PART2, + FEATURE_AFTER_PART_SPLIT, + FEATURE_AFTER_PART_SPLIT_PART2, + FEATURE_AFTER_PART_RECT, + FEATURE_AFTER_PART_AB +} PART_FEATURE_ID; + +/*!\brief Features pass to the external model to make partition decisions. + * + * The encoder sends these features to the external model through + * "func()" defined in ..... + * + * NOTE: new member variables may be added to this structure in the future. + * Once new features are finalized, bump the major version of libaom. + */ +typedef struct aom_partition_features { + PART_FEATURE_ID id; /**< Feature ID to indicate active features */ + aom_partition_features_before_none_t + before_part_none; /**< Features collected before NONE partition */ + aom_partition_features_none_t + after_part_none; /**< Features collected after NONE partition */ + aom_partition_features_split_t + after_part_split; /**< Features collected after SPLIT partition */ + aom_partition_features_rect_t + after_part_rect; /**< Features collected after RECTANGULAR partition */ + aom_partition_features_ab_t + after_part_ab; /**< Features collected after AB partition */ +} aom_partition_features_t; + +/*!\brief Partition decisions received from the external model. 
+ * + * The encoder receives partition decisions and encodes the superblock + * with the given partition type. + * The encoder receives it from "func()" define in .... + * + * NOTE: new member variables may be added to this structure in the future. + * Once new features are finalized, bump the major version of libaom. + */ +typedef struct aom_partition_decision { + // Decisions for directly set partition types + int is_final_decision; /**< The flag whether it is the final decision */ + int partition_decision[256]; /**< Partition decisions */ + + // Decisions for partition type pruning + int terminate_partition_search; /**< Terminate further partition search */ + int partition_none_allowed; /**< Allow partition none type */ + int partition_rect_allowed[2]; /**< Allow rectangular partitions */ + int do_rectangular_split; /**< Try rectangular split partition */ + int do_square_split; /**< Try square split partition */ + int prune_rect_part[2]; /**< Prune rectangular partition */ + int horza_partition_allowed; /**< Allow HORZ_A partitioin */ + int horzb_partition_allowed; /**< Allow HORZ_B partitioin */ + int verta_partition_allowed; /**< Allow VERT_A partitioin */ + int vertb_partition_allowed; /**< Allow VERT_B partitioin */ + int partition_horz4_allowed; /**< Allow HORZ4 partition */ + int partition_vert4_allowed; /**< Allow VERT4 partition */ +} aom_partition_decision_t; + +/*!\brief Encoding stats for the given partition decision. + * + * The encoding stats collected by encoding the superblock with the + * given partition types. + * The encoder sends the stats to the external model for training + * or inference though "func()" defined in .... + */ +typedef struct aom_partition_stats { + int rate; /**< Rate cost of the block */ + int64_t dist; /**< Distortion of the block */ + int64_t rdcost; /**< Rate-distortion cost of the block */ +} aom_partition_stats_t; + +/*!\brief Enum for return status. 
+ */ +typedef enum aom_ext_part_status { + AOM_EXT_PART_OK = 0, /**< Status of success */ + AOM_EXT_PART_ERROR = 1, /**< Status of failure */ + AOM_EXT_PART_TEST = 2, /**< Status used for tests */ +} aom_ext_part_status_t; + +/*!\brief Callback of creating an external partition model. + * + * The callback is invoked by the encoder to create an external partition + * model. + * + * \param[in] priv Callback's private data + * \param[in] part_config Config information pointer for model creation + * \param[out] ext_part_model Pointer to the model + */ +typedef aom_ext_part_status_t (*aom_ext_part_create_model_fn_t)( + void *priv, const aom_ext_part_config_t *part_config, + aom_ext_part_model_t *ext_part_model); + +/*!\brief Callback of sending features to the external partition model. + * + * The callback is invoked by the encoder to send features to the external + * partition model. + * + * \param[in] ext_part_model The external model + * \param[in] part_features Pointer to the features + */ +typedef aom_ext_part_status_t (*aom_ext_part_send_features_fn_t)( + aom_ext_part_model_t ext_part_model, + const aom_partition_features_t *part_features); + +/*!\brief Callback of receiving partition decisions from the external + * partition model. + * + * The callback is invoked by the encoder to receive partition decisions from + * the external partition model. + * + * \param[in] ext_part_model The external model + * \param[in] ext_part_decision Pointer to the partition decisions + */ +typedef aom_ext_part_status_t (*aom_ext_part_get_decision_fn_t)( + aom_ext_part_model_t ext_part_model, + aom_partition_decision_t *ext_part_decision); + +/*!\brief Callback of sending stats to the external partition model. + * + * The callback is invoked by the encoder to send encoding stats to + * the external partition model. 
+ * + * \param[in] ext_part_model The external model + * \param[in] ext_part_stats Pointer to the encoding stats + */ +typedef aom_ext_part_status_t (*aom_ext_part_send_partition_stats_fn_t)( + aom_ext_part_model_t ext_part_model, + const aom_partition_stats_t *ext_part_stats); + +/*!\brief Callback of deleting the external partition model. + * + * The callback is invoked by the encoder to delete the external partition + * model. + * + * \param[in] ext_part_model The external model + */ +typedef aom_ext_part_status_t (*aom_ext_part_delete_model_fn_t)( + aom_ext_part_model_t ext_part_model); + +/*!\brief Callback function set for external partition model. + * + * Uses can enable external partition model by registering a set of + * callback functions with the flag: AV1E_SET_EXTERNAL_PARTITION_MODEL + */ +typedef struct aom_ext_part_funcs { + /*! + * Create an external partition model. + */ + aom_ext_part_create_model_fn_t create_model; + + /*! + * Send features to the external partition model to make partition decisions. + */ + aom_ext_part_send_features_fn_t send_features; + + /*! + * Get partition decisions from the external partition model. + */ + aom_ext_part_get_decision_fn_t get_partition_decision; + + /*! + * Send stats of the current partition to the external model. + */ + aom_ext_part_send_partition_stats_fn_t send_partition_stats; + + /*! + * Delete the external partition model. + */ + aom_ext_part_delete_model_fn_t delete_model; + + /*! + * Private data for the external partition model. 
+ */ + void *priv; +} aom_ext_part_funcs_t; + +/*!@} - end defgroup aom_encoder*/ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_EXTERNAL_PARTITION_H_ diff --git a/third_party/libaom/source/libaom/aom/aomcx.h b/third_party/libaom/source/libaom/aom/aomcx.h index 87f0b5db9b..8345911abd 100644 --- a/third_party/libaom/source/libaom/aom/aomcx.h +++ b/third_party/libaom/source/libaom/aom/aomcx.h @@ -18,6 +18,7 @@ */ #include "aom/aom.h" #include "aom/aom_encoder.h" +#include "aom/aom_external_partition.h" /*!\file * \brief Provides definitions for using AOM or AV1 encoder algorithm within the @@ -167,6 +168,7 @@ extern aom_codec_iface_t *aom_codec_av1_cx(void); * * This set of macros define the control functions available for AVx * encoder interface. + * The range of encode control ID is 7-229(max). * * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) */ @@ -221,10 +223,14 @@ enum aome_enc_control_id { /* NOTE: enum 15 unused */ - /*!\brief Codec control function to set loop filter sharpness, + /*!\brief Codec control function to set the sharpness parameter, * unsigned int parameter. * - * Valid range: 0..7. The default is 0. + * This parameter controls the level at which rate-distortion optimization of + * transform coefficients favours sharpness in the block. + * + * Valid range: 0..7. The default is 0. Values 1-7 will avoid eob and skip + * block optimization and will change rdmult in favour of block sharpness. 
*/ AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2, // 16 @@ -1204,9 +1210,6 @@ enum aome_enc_control_id { parameter */ AV1E_SET_REDUCED_REFERENCE_SET = 125, - /* NOTE: enums 126-139 unused */ - /* NOTE: Need a gap in enum values to avoud conflict with 128, 129, 130 */ - /*!\brief Control to set frequency of the cost updates for coefficients, * unsigned int parameter * @@ -1215,7 +1218,7 @@ enum aome_enc_control_id { * - 2 = update at tile level * - 3 = turn off */ - AV1E_SET_COEFF_COST_UPD_FREQ = 140, + AV1E_SET_COEFF_COST_UPD_FREQ = 126, /*!\brief Control to set frequency of the cost updates for mode, unsigned int * parameter @@ -1225,7 +1228,7 @@ enum aome_enc_control_id { * - 2 = update at tile level * - 3 = turn off */ - AV1E_SET_MODE_COST_UPD_FREQ = 141, + AV1E_SET_MODE_COST_UPD_FREQ = 127, /*!\brief Control to set frequency of the cost updates for motion vectors, * unsigned int parameter @@ -1235,7 +1238,7 @@ enum aome_enc_control_id { * - 2 = update at tile level * - 3 = turn off */ - AV1E_SET_MV_COST_UPD_FREQ = 142, + AV1E_SET_MV_COST_UPD_FREQ = 128, /*!\brief Control to set bit mask that specifies which tier each of the 32 * possible operating points conforms to, unsigned int parameter @@ -1243,37 +1246,37 @@ enum aome_enc_control_id { * - 0 = main tier (default) * - 1 = high tier */ - AV1E_SET_TIER_MASK = 143, + AV1E_SET_TIER_MASK = 129, /*!\brief Control to set minimum compression ratio, unsigned int parameter * Take integer values. If non-zero, encoder will try to keep the compression * ratio of each frame to be higher than the given value divided by 100. * E.g. 850 means minimum compression ratio of 8.5. 
*/ - AV1E_SET_MIN_CR = 144, + AV1E_SET_MIN_CR = 130, /* NOTE: enums 145-149 unused */ /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t* * parameter */ - AV1E_SET_SVC_LAYER_ID = 150, + AV1E_SET_SVC_LAYER_ID = 131, /*!\brief Codec control function to set SVC paramaeters, aom_svc_params_t* * parameter */ - AV1E_SET_SVC_PARAMS = 151, + AV1E_SET_SVC_PARAMS = 132, /*!\brief Codec control function to set reference frame config: * the ref_idx and the refresh flags for each buffer slot. * aom_svc_ref_frame_config_t* parameter */ - AV1E_SET_SVC_REF_FRAME_CONFIG = 152, + AV1E_SET_SVC_REF_FRAME_CONFIG = 133, /*!\brief Codec control function to set the path to the VMAF model used when * tuning the encoder for VMAF, const char* parameter */ - AV1E_SET_VMAF_MODEL_PATH = 153, + AV1E_SET_VMAF_MODEL_PATH = 134, /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder, * unsigned int parameter @@ -1283,7 +1286,7 @@ enum aome_enc_control_id { * * \note This is only used in lightfield example test. */ - AV1E_ENABLE_EXT_TILE_DEBUG = 154, + AV1E_ENABLE_EXT_TILE_DEBUG = 135, /*!\brief Codec control function to enable the superblock multipass unit test * in AV1 to ensure that the encoder does not leak state between different @@ -1294,30 +1297,30 @@ enum aome_enc_control_id { * * \note This is only used in sb_multipass unit test. 
*/ - AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 155, + AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 136, /*!\brief Control to select minimum height for the GF group pyramid structure, * unsigned int parameter * * Valid values: 0..5 */ - AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 156, + AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 137, /*!\brief Control to set average complexity of the corpus in the case of * single pass vbr based on LAP, unsigned int parameter */ - AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 157, + AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 138, /*!\brief Control to get baseline gf interval */ - AV1E_GET_BASELINE_GF_INTERVAL = 158, + AV1E_GET_BASELINE_GF_INTERVAL = 139, /*\brief Control to set encoding the denoised frame from denoise-noise-level * * - 0 = disabled/encode the original frame * - 1 = enabled/encode the denoised frame (default) */ - AV1E_SET_ENABLE_DNL_DENOISING = 159, + AV1E_SET_ENABLE_DNL_DENOISING = 140, /*!\brief Codec control function to turn on / off D45 to D203 intra mode * usage, int parameter @@ -1327,7 +1330,32 @@ enum aome_enc_control_id { * - 0 = disable * - 1 = enable (default) */ - AV1E_SET_ENABLE_DIAGONAL_INTRA = 160, + AV1E_SET_ENABLE_DIAGONAL_INTRA = 141, + + /*!\brief Control to set frequency of the cost updates for intrabc motion + * vectors, unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_DV_COST_UPD_FREQ = 142, + + /*!\brief Codec control to set the path for partition stats read and write. + * const char * parameter. + */ + AV1E_SET_PARTITION_INFO_PATH = 143, + + /*!\brief Codec control to use an external partition model + * A set of callback functions is passed through this control + * to let the encoder encode with given partitions. + */ + AV1E_SET_EXTERNAL_PARTITION = 144, + + // Any new encoder control IDs should be added above. + // Maximum allowed encoder control ID is 229. + // No encoder control ID should be added below. 
}; /*!\brief aom 1-D scaling mode @@ -1858,6 +1886,15 @@ AOM_CTRL_USE_TYPE(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, unsigned int) AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DNL_DENOISING, int) #define AOM_CTRL_AV1E_SET_ENABLE_DNL_DENOISING +AOM_CTRL_USE_TYPE(AV1E_SET_DV_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_DV_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_PARTITION_INFO_PATH, const char *) +#define AOM_CTRL_AV1E_SET_PARTITION_INFO_PATH + +AOM_CTRL_USE_TYPE(AV1E_SET_EXTERNAL_PARTITION, aom_ext_part_funcs_t *) +#define AOM_CTRL_AV1E_SET_ENABLE_DNL_DENOISING + /*!\endcond */ /*! @} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/aom/aomdx.h b/third_party/libaom/source/libaom/aom/aomdx.h index aa4f435ec4..b3fd90e460 100644 --- a/third_party/libaom/source/libaom/aom/aomdx.h +++ b/third_party/libaom/source/libaom/aom/aomdx.h @@ -188,6 +188,7 @@ typedef struct av1_ext_ref_frame { * * This set of macros define the control functions available for the AOM * decoder interface. + * The range for decoder control ID is >= 256. * * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) 
*/ @@ -381,8 +382,6 @@ enum aom_dec_control_id { */ AV1D_SET_SKIP_FILM_GRAIN, - AOM_DECODER_CTRL_ID_MAX, - /*!\brief Codec control function to check the presence of forward key frames */ AOMD_GET_FWD_KF_PRESENT, diff --git a/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h b/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h index 0ad33bdf2e..457da9244a 100644 --- a/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h +++ b/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h @@ -278,7 +278,7 @@ typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)( typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)( aom_codec_alg_priv_t *ctx); -/*!\brief Decoder algorithm interface interface +/*!\brief Decoder algorithm interface * * All decoders \ref MUST expose a variable of this type. */ diff --git a/third_party/libaom/source/libaom/aom/src/aom_image.c b/third_party/libaom/source/libaom/aom/src/aom_image.c index dfdee87d26..13f71b2bf5 100644 --- a/third_party/libaom/source/libaom/aom/src/aom_image.c +++ b/third_party/libaom/source/libaom/aom/src/aom_image.c @@ -38,6 +38,8 @@ static aom_image_t *img_alloc_helper( unsigned int h, w, s, xcs, ycs, bps, bit_depth; unsigned int stride_in_bytes; + if (img != NULL) memset(img, 0, sizeof(aom_image_t)); + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -111,8 +113,6 @@ static aom_image_t *img_alloc_helper( if (!img) goto fail; img->self_allocd = 1; - } else { - memset(img, 0, sizeof(aom_image_t)); } img->img_data = img_data; diff --git a/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c b/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c index 6d41708ee0..945e7e48ee 100644 --- a/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c @@ -11,8 +11,6 @@ #include <arm_neon.h> -#include "common/tools_common.h" - #include "config/aom_config.h" 
#include "config/aom_dsp_rtcd.h" diff --git a/third_party/libaom/source/libaom/aom_dsp/butteraugli.c b/third_party/libaom/source/libaom/aom_dsp/butteraugli.c index 7ce2324c06..038efcd313 100644 --- a/third_party/libaom/source/libaom/aom_dsp/butteraugli.c +++ b/third_party/libaom/source/libaom/aom_dsp/butteraugli.c @@ -18,37 +18,71 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, - float *dist_map) { + aom_matrix_coefficients_t matrix_coefficients, + aom_color_range_t color_range, float *dist_map) { (void)bit_depth; assert(bit_depth == 8); - assert(source->y_width == source->uv_width * 2); const int width = source->y_crop_width; const int height = source->y_crop_height; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; - size_t buffer_size = width * height * 3; - uint8_t *src_rgb = (uint8_t *)aom_malloc(buffer_size); - uint8_t *distorted_rgb = (uint8_t *)aom_malloc(buffer_size); - if (!src_rgb || !distorted_rgb) { - aom_free(src_rgb); - aom_free(distorted_rgb); + const struct YuvConstants *yuv_constants; + if (matrix_coefficients == AOM_CICP_MC_BT_709) { + if (color_range == AOM_CR_FULL_RANGE) return 0; + yuv_constants = &kYuvH709Constants; + } else { + yuv_constants = color_range == AOM_CR_FULL_RANGE ? 
&kYuvJPEGConstants + : &kYuvI601Constants; + } + + const size_t stride_argb = width * 4; + const size_t buffer_size = height * stride_argb; + uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size); + uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size); + if (!src_argb || !distorted_argb) { + aom_free(src_argb); + aom_free(distorted_argb); return 0; } - I420ToRGB24Matrix(source->y_buffer, source->y_stride, source->u_buffer, - source->uv_stride, source->v_buffer, source->uv_stride, - src_rgb, width * 3, &kYuvH709Constants, width, height); - I420ToRGB24Matrix(distorted->y_buffer, distorted->y_stride, - distorted->u_buffer, distorted->uv_stride, - distorted->v_buffer, distorted->uv_stride, distorted_rgb, - width * 3, &kYuvH709Constants, width, height); + if (ss_x == 1 && ss_y == 1) { + I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else if (ss_x == 1 && ss_y == 0) { + I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else if (ss_x == 0 && ss_y == 0) { + I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, 
distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else { + aom_free(src_argb); + aom_free(distorted_argb); + return 0; + } - JxlPixelFormat pixel_format = { 3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 }; + JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 }; JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL); JxlButteraugliApiSetHFAsymmetry(api, 0.8f); JxlButteraugliResult *result = JxlButteraugliCompute( - api, width, height, &pixel_format, src_rgb, buffer_size, &pixel_format, - distorted_rgb, buffer_size); + api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format, + distorted_argb, buffer_size); const float *distmap = NULL; uint32_t row_stride; @@ -56,8 +90,8 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, if (distmap == NULL) { JxlButteraugliApiDestroy(api); JxlButteraugliResultDestroy(result); - aom_free(src_rgb); - aom_free(distorted_rgb); + aom_free(src_argb); + aom_free(distorted_argb); return 0; } @@ -69,7 +103,7 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, JxlButteraugliApiDestroy(api); JxlButteraugliResultDestroy(result); - aom_free(src_rgb); - aom_free(distorted_rgb); + aom_free(src_argb); + aom_free(distorted_argb); return 1; } diff --git a/third_party/libaom/source/libaom/aom_dsp/butteraugli.h b/third_party/libaom/source/libaom/aom_dsp/butteraugli.h index 06402aa3e4..5304092ccb 100644 --- a/third_party/libaom/source/libaom/aom_dsp/butteraugli.h +++ b/third_party/libaom/source/libaom/aom_dsp/butteraugli.h @@ -14,8 +14,10 @@ #include "aom_scale/yv12config.h" +// Returns a boolean that indicates success/failure. 
int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, - float *dist_map); + aom_matrix_coefficients_t matrix_coefficients, + aom_color_range_t color_range, float *dist_map); #endif // AOM_AOM_DSP_BUTTERAUGLI_H_ diff --git a/third_party/libaom/source/libaom/aom_dsp/fastssim.c b/third_party/libaom/source/libaom/aom_dsp/fastssim.c index 3804519b31..89712c5f40 100644 --- a/third_party/libaom/source/libaom/aom_dsp/fastssim.c +++ b/third_party/libaom/source/libaom/aom_dsp/fastssim.c @@ -31,6 +31,7 @@ typedef struct fs_ctx fs_ctx; #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) +#define MAX_SSIM_DB 100.0 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b)) diff --git a/third_party/libaom/source/libaom/aom_dsp/grain_table.c b/third_party/libaom/source/libaom/aom_dsp/grain_table.c index e03f04d5da..b22752abd9 100644 --- a/third_party/libaom/source/libaom/aom_dsp/grain_table.c +++ b/third_party/libaom/source/libaom/aom_dsp/grain_table.c @@ -202,7 +202,7 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, int64_t end_time, int erase, aom_film_grain_t *grain) { aom_film_grain_table_entry_t *entry = t->head; - aom_film_grain_table_entry_t *prev_entry = 0; + aom_film_grain_table_entry_t *prev_entry = NULL; uint16_t random_seed = grain ? 
grain->random_seed : 0; if (grain) memset(grain, 0, sizeof(*grain)); @@ -241,10 +241,10 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, entry->end_time = time_stamp; if (t->tail == entry) t->tail = new_entry; } - // If segments aren't aligned, delete from the beggining of subsequent + // If segments aren't aligned, delete from the beginning of subsequent // segments if (end_time > entry_end_time) { - aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0); + aom_film_grain_table_lookup(t, entry_end_time, end_time, 1, 0); } return 1; } @@ -275,12 +275,12 @@ aom_codec_err_t aom_film_grain_table_read( return error_info->error_code; } - aom_film_grain_table_entry_t *prev_entry = 0; + aom_film_grain_table_entry_t *prev_entry = NULL; while (!feof(file)) { aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); memset(entry, 0, sizeof(*entry)); grain_table_entry_read(file, error_info, entry); - entry->next = 0; + entry->next = NULL; if (prev_entry) prev_entry->next = entry; if (!t->head) t->head = entry; diff --git a/third_party/libaom/source/libaom/aom_dsp/noise_model.c b/third_party/libaom/source/libaom/aom_dsp/noise_model.c index f56fdd5860..19c660e911 100644 --- a/third_party/libaom/source/libaom/aom_dsp/noise_model.c +++ b/third_party/libaom/source/libaom/aom_dsp/noise_model.c @@ -214,6 +214,7 @@ static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) { int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) { if (!lut) return 0; + if (num_points <= 0) return 0; lut->num_points = 0; lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); if (!lut->points) return 0; @@ -1152,12 +1153,24 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, // Convert the scaling functions to 8 bit values aom_noise_strength_lut_t scaling_points[3]; - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[0].strength_solver, 14, 
scaling_points + 0); - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1); - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2); + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[0].strength_solver, 14, + scaling_points + 0)) { + return 0; + } + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[1].strength_solver, 10, + scaling_points + 1)) { + aom_noise_strength_lut_free(scaling_points + 0); + return 0; + } + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[2].strength_solver, 10, + scaling_points + 2)) { + aom_noise_strength_lut_free(scaling_points + 0); + aom_noise_strength_lut_free(scaling_points + 1); + return 0; + } // Both the domain and the range of the scaling functions in the film_grain // are normalized to 8-bit (e.g., they are implicitly scaled during grain diff --git a/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c b/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c index 69a1d99bf2..25f075aa2f 100644 --- a/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c +++ b/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c @@ -34,6 +34,7 @@ static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } +#if CONFIG_AV1_HIGHBITDEPTH static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; @@ -43,6 +44,7 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, for (j = 0; j < 8; j++) *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } +#endif // CONFIG_AV1_HIGHBITDEPTH /* Normalized inverse quantization matrix for 8x8 DCT at the point of * transparency. 
This is not the JPEG based matrix from the paper, @@ -210,6 +212,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, } } s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f; +#if CONFIG_AV1_HIGHBITDEPTH if (!buf_is_hbd) { od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); @@ -217,6 +220,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); } +#else + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); +#endif // CONFIG_AV1_HIGHBITDEPTH for (i = 0; i < 8; i++) for (j = (i == 0); j < 8; j++) s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; diff --git a/third_party/libaom/source/libaom/aom_dsp/ssim.c b/third_party/libaom/source/libaom/aom_dsp/ssim.c index 357da99ae4..c5334fd2c5 100644 --- a/third_party/libaom/source/libaom/aom_dsp/ssim.c +++ b/third_party/libaom/source/libaom/aom_dsp/ssim.c @@ -18,6 +18,7 @@ #include "aom_ports/mem.h" #include "aom_ports/system_state.h" +#if CONFIG_INTERNAL_STATS void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, @@ -33,6 +34,7 @@ void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } } } +#endif // CONFIG_INTERNAL_STATS void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, @@ -49,24 +51,6 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } } -#if CONFIG_AV1_HIGHBITDEPTH -void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t *sum_s, uint32_t *sum_r, - uint32_t *sum_sq_s, uint32_t *sum_sq_r, - uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 8; i++, s += sp, r += rp) { - for (j = 0; j < 8; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] 
* s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } - } -} -#endif - static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 @@ -78,7 +62,7 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { double ssim_n, ssim_d; - int64_t c1, c2; + int64_t c1 = 0, c2 = 0; if (bd == 8) { // scale the constants by number of pixels c1 = (cc1 * count * count) >> 12; @@ -90,8 +74,9 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, c1 = (cc1_12 * count * count) >> 12; c2 = (cc2_12 * count * count) >> 12; } else { - c1 = c2 = 0; assert(0); + // Return similarity as zero for unsupported bit-depth values. + return 0; } ssim_n = (2.0 * sum_s * sum_r + c1) * @@ -111,21 +96,11 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); } -static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t bd, uint32_t shift) { - uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, - &sum_sxr); - return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), - sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); -} - // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. 
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, - int stride_img1, int stride_img2, int width, - int height) { +double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { int i, j; int samples = 0; double ssim_total = 0; @@ -143,31 +118,10 @@ static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, return ssim_total; } -static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, - int stride_img1, int stride_img2, int width, - int height, uint32_t bd, uint32_t shift) { - int i, j; - int samples = 0; - double ssim_total = 0; - - // sample point start with each 4x4 location - for (i = 0; i <= height - 8; - i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j <= width - 8; j += 4) { - double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, - CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, - shift); - ssim_total += v; - samples++; - } - } - ssim_total /= samples; - return ssim_total; -} - -void aom_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - double *fast_ssim) { +#if CONFIG_INTERNAL_STATS +void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + double *fast_ssim) { double abc[3]; for (int i = 0; i < 3; ++i) { const int is_uv = i > 0; @@ -421,7 +375,57 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, m->dssim = dssim_total; return inconsistency_total; } +#endif // CONFIG_INTERNAL_STATS +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; 
+ } + } +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, int height, + uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +#if CONFIG_INTERNAL_STATS void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd, double *fast_ssim) { @@ -455,3 +459,25 @@ void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]); } } +#endif // CONFIG_INTERNAL_STATS +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if CONFIG_INTERNAL_STATS +void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, + const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, + const uint32_t in_bit_depth, int is_hbd, double *weight, + double *frame_ssim2) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth, + frame_ssim2); + return; + } +#else + (void)bit_depth; + (void)in_bit_depth; + (void)is_hbd; +#endif // CONFIG_AV1_HIGHBITDEPTH + aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2); +} +#endif // 
CONFIG_INTERNAL_STATS diff --git a/third_party/libaom/source/libaom/aom_dsp/ssim.h b/third_party/libaom/source/libaom/aom_dsp/ssim.h index d635ef5bbe..fb92556a8c 100644 --- a/third_party/libaom/source/libaom/aom_dsp/ssim.h +++ b/third_party/libaom/source/libaom/aom_dsp/ssim.h @@ -12,14 +12,13 @@ #ifndef AOM_AOM_DSP_SSIM_H_ #define AOM_AOM_DSP_SSIM_H_ -#define MAX_SSIM_DB 100.0; - #ifdef __cplusplus extern "C" { #endif #include "config/aom_config.h" +#if CONFIG_INTERNAL_STATS #include "aom_scale/yv12config.h" // metrics used for calculating ssim, ssim2, dssim, and ssimc @@ -68,18 +67,35 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, int width, int height, Ssimv *sv2, Metrics *m, int do_inconsistency); -void aom_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - double *fast_ssim); +void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + double *fast_ssim); double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v, uint32_t bd, uint32_t in_bd); +#if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd, double *fast_ssim); +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, + const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, + const uint32_t in_bit_depth, int is_hbd, double *weight, + double *frame_ssim2); +#endif // CONFIG_INTERNAL_STATS + +double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height); + +#if CONFIG_AV1_HIGHBITDEPTH +double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, int height, + uint32_t bd, uint32_t shift); +#endif // CONFIG_AV1_HIGHBITDEPTH #ifdef __cplusplus 
} // extern "C" diff --git a/third_party/libaom/source/libaom/aom_dsp/vmaf.c b/third_party/libaom/source/libaom/aom_dsp/vmaf.c index 41653430c1..219e278303 100644 --- a/third_party/libaom/source/libaom/aom_dsp/vmaf.c +++ b/third_party/libaom/source/libaom/aom_dsp/vmaf.c @@ -12,9 +12,6 @@ #include "aom_dsp/vmaf.h" #include <assert.h> -#if !CONFIG_USE_VMAF_RC -#include <libvmaf.h> -#endif #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -24,10 +21,7 @@ #include <unistd.h> #endif -#if CONFIG_USE_VMAF_RC -#include <libvmaf/libvmaf.rc.h> -#endif - +#include <libvmaf/libvmaf.h> #include "aom_dsp/blend.h" #include "aom_ports/system_state.h" @@ -36,162 +30,18 @@ static void vmaf_fatal_error(const char *message) { exit(EXIT_FAILURE); } -#if !CONFIG_USE_VMAF_RC -typedef struct FrameData { - const YV12_BUFFER_CONFIG *source; - const YV12_BUFFER_CONFIG *distorted; - int frame_set; - int bit_depth; -} FrameData; - -// A callback function used to pass data to VMAF. -// Returns 0 after reading a frame. -// Returns 2 when there is no more frame to read. 
-static int read_frame(float *ref_data, float *main_data, float *temp_data, - int stride, void *user_data) { - FrameData *frames = (FrameData *)user_data; - - if (!frames->frame_set) { - const int width = frames->source->y_width; - const int height = frames->source->y_height; - assert(width == frames->distorted->y_width); - assert(height == frames->distorted->y_height); - - if (frames->source->flags & YV12_FLAG_HIGHBITDEPTH) { - const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8)); - uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer); - uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer); - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - ref_data[col] = scale_factor * (float)ref_ptr[col]; - } - ref_ptr += frames->source->y_stride; - ref_data += stride / sizeof(*ref_data); - } - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - main_data[col] = scale_factor * (float)main_ptr[col]; - } - main_ptr += frames->distorted->y_stride; - main_data += stride / sizeof(*main_data); - } - } else { - uint8_t *ref_ptr = frames->source->y_buffer; - uint8_t *main_ptr = frames->distorted->y_buffer; - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - ref_data[col] = (float)ref_ptr[col]; - } - ref_ptr += frames->source->y_stride; - ref_data += stride / sizeof(*ref_data); - } - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - main_data[col] = (float)main_ptr[col]; - } - main_ptr += frames->distorted->y_stride; - main_data += stride / sizeof(*main_data); - } - } - frames->frame_set = 1; - return 0; - } - - (void)temp_data; - return 2; -} - -void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *distorted, const int bit_depth, - double *const vmaf) { - aom_clear_system_state(); - const int width = source->y_width; - const int height = 
source->y_height; - FrameData frames = { source, distorted, 0, bit_depth }; - char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p"; - double vmaf_score; - const int ret = - compute_vmaf(&vmaf_score, fmt, width, height, read_frame, - /*user_data=*/&frames, (char *)model_path, - /*log_path=*/NULL, /*log_fmt=*/NULL, /*disable_clip=*/1, - /*disable_avx=*/0, /*enable_transform=*/0, - /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0, - /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0, - /*n_subsample=*/1, /*enable_conf_interval=*/0); - if (ret) vmaf_fatal_error("Failed to compute VMAF scores."); - - aom_clear_system_state(); - *vmaf = vmaf_score; -} - -void aom_calc_vmaf_multi_frame(void *user_data, const char *model_path, - int (*rd_frm)(float *ref_data, float *main_data, - float *temp_data, int stride_byte, - void *user_data), - int frame_width, int frame_height, int bit_depth, - double *vmaf) { - aom_clear_system_state(); - - char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p"; - int log_path_length = snprintf(NULL, 0, "vmaf_scores_%d.xml", getpid()) + 1; - char *log_path = malloc(log_path_length); - snprintf(log_path, log_path_length, "vmaf_scores_%d.xml", getpid()); - double vmaf_score; - const int ret = - compute_vmaf(&vmaf_score, fmt, frame_width, frame_height, rd_frm, - /*user_data=*/user_data, (char *)model_path, - /*log_path=*/log_path, /*log_fmt=*/NULL, /*disable_clip=*/0, - /*disable_avx=*/0, /*enable_transform=*/0, - /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0, - /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0, - /*n_subsample=*/1, /*enable_conf_interval=*/0); - FILE *vmaf_log = fopen(log_path, "r"); - free(log_path); - log_path = NULL; - if (vmaf_log == NULL || ret) { - vmaf_fatal_error("Failed to compute VMAF scores."); - } - - int frame_index = 0; - char buf[512]; - while (fgets(buf, 511, vmaf_log) != NULL) { - if (memcmp(buf, "\t\t<frame ", 9) == 0) { - char *p = strstr(buf, "vmaf="); - if (p != NULL && p[5] == '"') { - char *p2 = 
strstr(&p[6], "\""); - *p2 = '\0'; - const double score = atof(&p[6]); - if (score < 0.0 || score > 100.0) { - vmaf_fatal_error("Failed to compute VMAF scores."); - } - vmaf[frame_index++] = score; - } - } - } - fclose(vmaf_log); - - aom_clear_system_state(); -} -#endif - -#if CONFIG_USE_VMAF_RC -void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path) { +void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) { if (*vmaf_model != NULL) return; VmafModelConfig model_cfg; model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP; model_cfg.name = "vmaf"; - model_cfg.path = (char *)model_path; - if (vmaf_model_load_from_path(vmaf_model, &model_cfg)) { + if (vmaf_model_load_from_path(vmaf_model, &model_cfg, model_path)) { vmaf_fatal_error("Failed to load VMAF model."); } } -void aom_close_vmaf_model_rc(VmafModel *vmaf_model) { +void aom_close_vmaf_model(VmafModel *vmaf_model) { vmaf_model_destroy(vmaf_model); } @@ -221,8 +71,9 @@ static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src, } } -void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model, - bool cal_vmaf_neg) { +void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, + bool cal_vmaf_neg) { + // TODO(sdeng): make them CLI arguments. 
VmafConfiguration cfg; cfg.log_level = VMAF_LOG_LEVEL_NONE; cfg.n_threads = 0; @@ -233,41 +84,53 @@ void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model, vmaf_fatal_error("Failed to init VMAF context."); } - if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) { - vmaf_fatal_error("Failed to load feature extractors from VMAF model."); - } - if (cal_vmaf_neg) { VmafFeatureDictionary *vif_feature = NULL; - vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", "1.0"); - if (vmaf_use_feature(*vmaf_context, "float_vif", vif_feature)) { + if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", + "1.0")) { + vmaf_fatal_error("Failed to set vif_enhn_gain_limit."); + } + if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) { vmaf_fatal_error("Failed to use feature float_vif."); } VmafFeatureDictionary *adm_feature = NULL; - vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", "1.0"); - if (vmaf_use_feature(*vmaf_context, "float_adm", adm_feature)) { + if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", + "1.0")) { + vmaf_fatal_error("Failed to set adm_enhn_gain_limit."); + } + if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) { vmaf_fatal_error("Failed to use feature float_adm."); } } VmafFeatureDictionary *motion_force_zero = NULL; - vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", "true"); - if (vmaf_use_feature(*vmaf_context, "float_motion", motion_force_zero)) { + if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", + "1")) { + vmaf_fatal_error("Failed to set motion_force_zero."); + } + if (vmaf_model_feature_overload(vmaf_model, "float_motion", + motion_force_zero)) { vmaf_fatal_error("Failed to use feature float_motion."); } + + if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) { + vmaf_fatal_error("Failed to load feature extractors from VMAF model."); + } } -void 
aom_close_vmaf_context_rc(VmafContext *vmaf_context) { +void aom_close_vmaf_context(VmafContext *vmaf_context) { if (vmaf_close(vmaf_context)) { vmaf_fatal_error("Failed to close VMAF context."); } } -void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *distorted, - int bit_depth, int frame_index, double *vmaf) { +void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + bool cal_vmaf_neg, double *vmaf) { + VmafContext *vmaf_context; + aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg); + const int frame_index = 0; VmafPicture ref, dist; if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, source->y_height) || @@ -282,10 +145,50 @@ void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model, vmaf_fatal_error("Failed to read VMAF pictures."); } + if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { + vmaf_fatal_error("Failed to flush context."); + } + vmaf_picture_unref(&ref); vmaf_picture_unref(&dist); vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index); + aom_close_vmaf_context(vmaf_context); } -#endif // CONFIG_USE_VMAF_RC +void aom_read_vmaf_image(VmafContext *vmaf_context, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + int frame_index) { + VmafPicture ref, dist; + if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, + source->y_height) || + vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth, + source->y_width, source->y_height)) { + vmaf_fatal_error("Failed to alloc VMAF pictures."); + } + copy_picture(bit_depth, source, &ref); + copy_picture(bit_depth, distorted, &dist); + if (vmaf_read_pictures(vmaf_context, &ref, &dist, + /*picture index=*/frame_index)) { + vmaf_fatal_error("Failed to read VMAF pictures."); + } + + vmaf_picture_unref(&ref); + 
vmaf_picture_unref(&dist); +} + +double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, + int frame_index) { + double vmaf; + if (vmaf_score_at_index(vmaf_context, vmaf_model, &vmaf, frame_index)) { + vmaf_fatal_error("Failed to calc VMAF scores."); + } + return vmaf; +} + +void aom_flush_vmaf_context(VmafContext *vmaf_context) { + if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { + vmaf_fatal_error("Failed to flush context."); + } +} diff --git a/third_party/libaom/source/libaom/aom_dsp/vmaf.h b/third_party/libaom/source/libaom/aom_dsp/vmaf.h index d9da223e29..3ba8c8d565 100644 --- a/third_party/libaom/source/libaom/aom_dsp/vmaf.h +++ b/third_party/libaom/source/libaom/aom_dsp/vmaf.h @@ -15,33 +15,28 @@ #include <stdbool.h> #include "aom_scale/yv12config.h" -#if CONFIG_USE_VMAF_RC typedef struct VmafContext VmafContext; typedef struct VmafModel VmafModel; -#endif - -#if CONFIG_USE_VMAF_RC -void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model, - bool cal_vmaf_neg); -void aom_close_vmaf_context_rc(VmafContext *vmaf_context); - -void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path); -void aom_close_vmaf_model_rc(VmafModel *vmaf_model); - -void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *distorted, - int bit_depth, int frame_index, double *vmaf); -#else -void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source, + +void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, + bool cal_vmaf_neg); +void aom_close_vmaf_context(VmafContext *vmaf_context); + +void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path); +void aom_close_vmaf_model(VmafModel *vmaf_model); + +void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, - double *vmaf); - -void aom_calc_vmaf_multi_frame( - 
void *user_data, const char *model_path, - int (*read_frame)(float *ref_data, float *main_data, float *temp_data, - int stride_byte, void *user_data), - int frame_width, int frame_height, int bit_depth, double *vmaf); -#endif // CONFIG_USE_VMAF_RC + bool cal_vmaf_neg, double *vmaf); + +void aom_read_vmaf_image(VmafContext *vmaf_context, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + int frame_index); + +double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, + int frame_index); + +void aom_flush_vmaf_context(VmafContext *vmaf_context); #endif // AOM_AOM_DSP_VMAF_H_ diff --git a/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm index 58f1ac964e..a2510d5e7f 100644 --- a/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm @@ -20,20 +20,21 @@ SECTION .text ; Arg 2: Height ; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows -%macro HIGH_SAD_FN 4 +; Arg 5: Number of xmm registers. 
8xh needs 8, others only need 7 +%macro HIGH_SAD_FN 4-5 7 %if %4 == 0 %if %3 == 5 -cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 -cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ +cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %elif %4 == 1 ; avg %if %3 == 5 -cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 -cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, %5, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 @@ -356,7 +357,7 @@ HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 - HIGH_SAD_FN 8, %1, 7, %2 + HIGH_SAD_FN 8, %1, 7, %2, 8 %if %2 == 2 ; skip rows, so divide number of rows by 2 mov n_rowsd, %1/8 %else @@ -377,22 +378,30 @@ HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 pavgw m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif - mova m5, [srcq] - psubusw m5, m1 - psubusw m1, [srcq] + mova m7, m1 + movu m5, [srcq] + psubusw m1, m5 + psubusw m5, m7 por m1, m5 - mova m5, [srcq+src_strideq*2] - psubusw m5, m2 - psubusw m2, [srcq+src_strideq*2] + + mova m7, m2 + movu m5, [srcq+src_strideq*2] + psubusw m2, m5 + psubusw m5, m7 por m2, m5 - mova m5, [srcq+src_strideq*4] - psubusw m5, m3 - psubusw m3, [srcq+src_strideq*4] + + mova m7, m3 + movu m5, [srcq+src_strideq*4] + psubusw m3, m5 + psubusw m5, m7 por m3, m5 - mova m5, [srcq+src_stride3q*2] - psubusw m5, m4 - psubusw m4, [srcq+src_stride3q*2] + + mova m7, m4 + movu m5, [srcq+src_stride3q*2] + 
psubusw m4, m5 + psubusw m5, m7 por m4, m5 + paddw m1, m2 paddw m3, m4 movhlps m2, m1 diff --git a/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c b/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c index f779270ae3..163e4cc566 100644 --- a/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c +++ b/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c @@ -616,7 +616,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( src += src_stride; dst += dst_stride; } - } else if (y_offset == 8) { + } else if (y_offset == 4) { __m256i src_next_reg; for (i = 0; i < height; i++) { LOAD_SRC_DST @@ -652,8 +652,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { if (y_offset == 0) { __m256i src_next_reg; for (i = 0; i < height; i++) { @@ -668,8 +668,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( src += src_stride; dst += dst_stride; } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte @@ -691,7 +691,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( CALC_SUM_SSE_INSIDE_LOOP dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 4 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; @@ -741,8 +741,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( src += src_stride; dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; filter = _mm256_load_si256( diff --git 
a/third_party/libaom/source/libaom/apps/aomenc.c b/third_party/libaom/source/libaom/apps/aomenc.c index 11035bf129..c09c3ca9c2 100644 --- a/third_party/libaom/source/libaom/apps/aomenc.c +++ b/third_party/libaom/source/libaom/apps/aomenc.c @@ -227,6 +227,8 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, #if CONFIG_TUNE_VMAF AV1E_SET_VMAF_MODEL_PATH, #endif + AV1E_SET_DV_COST_UPD_FREQ, + AV1E_SET_PARTITION_INFO_PATH, 0 }; const arg_def_t *main_args[] = { &g_av1_codec_arg_defs.help, @@ -422,6 +424,8 @@ const arg_def_t *av1_ctrl_args[] = { #if CONFIG_TUNE_VMAF &g_av1_codec_arg_defs.vmaf_model_path, #endif + &g_av1_codec_arg_defs.dv_cost_upd_freq, + &g_av1_codec_arg_defs.partition_info_path, NULL, }; @@ -505,6 +509,7 @@ struct stream_config { #if CONFIG_TUNE_VMAF const char *vmaf_model_path; #endif + const char *partition_info_path; aom_color_range_t color_range; }; @@ -681,6 +686,8 @@ static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) { if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) { warn("Enforcing one-pass encoding in realtime mode\n"); + if (global->pass > 1) + die("Error: Invalid --pass=%d for one-pass encoding\n", global->pass); global->passes = 1; } @@ -853,9 +860,9 @@ static void set_config_arg_key_vals(struct stream_config *config, } /* Point either to the next free element or the first instance of this - * control. + * option. 
*/ - for (j = 0; j < config->arg_ctrl_cnt; j++) + for (j = 0; j < config->arg_key_val_cnt; j++) if (strcmp(name, config->arg_key_vals[j][0]) == 0) break; /* Update/insert */ @@ -1071,6 +1078,9 @@ static int parse_stream_params(struct AvxEncoderConfig *global, } else if (arg_match(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argi)) { config->vmaf_model_path = arg.val; #endif + } else if (arg_match(&arg, &g_av1_codec_arg_defs.partition_info_path, + argi)) { + config->partition_info_path = arg.val; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_fixed_qp_offsets, argi)) { config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg); @@ -1078,9 +1088,14 @@ static int parse_stream_params(struct AvxEncoderConfig *global, const int fixed_qp_offset_count = arg_parse_list( &arg, config->cfg.fixed_qp_offsets, FIXED_QP_OFFSET_COUNT); if (fixed_qp_offset_count < FIXED_QP_OFFSET_COUNT) { - die("Option --fixed_qp_offsets requires %d comma-separated values, but " - "only %d values were provided.\n", - FIXED_QP_OFFSET_COUNT, fixed_qp_offset_count); + if (fixed_qp_offset_count < 2) { + die("Option --fixed_qp_offsets requires at least 2 comma-separated " + "values for kf and arf, but only %d were provided.\n", + fixed_qp_offset_count); + } + for (int k = fixed_qp_offset_count; k < FIXED_QP_OFFSET_COUNT; ++k) + config->cfg.fixed_qp_offsets[k] = + (config->cfg.fixed_qp_offsets[k - 1] + 1) / 2; } config->cfg.use_fixed_qp_offsets = 1; } else if (global->usage == AOM_USAGE_REALTIME && @@ -1301,7 +1316,6 @@ static void show_stream_config(struct stream_state *stream, SHOW_PARAMS(disable_intrabc); SHOW_PARAMS(disable_cfl); SHOW_PARAMS(disable_smooth_intra); - SHOW_PARAMS(disable_diagonal_intra); SHOW_PARAMS(disable_filter_intra); SHOW_PARAMS(disable_dual_filter); SHOW_PARAMS(disable_intra_angle_delta); @@ -1437,6 +1451,11 @@ static void initialize_encoder(struct stream_state *stream, stream->config.vmaf_model_path); } #endif + if (stream->config.partition_info_path) { + 
AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_PARTITION_INFO_PATH, + stream->config.partition_info_path); + } if (stream->config.film_grain_filename) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, @@ -1473,6 +1492,33 @@ static void initialize_encoder(struct stream_state *stream, #endif } +// Convert the input image 'img' to a monochrome image. The Y plane of the +// output image is a shallow copy of the Y plane of the input image, therefore +// the input image must remain valid for the lifetime of the output image. The U +// and V planes of the output image are set to null pointers. The output image +// format is AOM_IMG_FMT_I420 because libaom does not have AOM_IMG_FMT_I400. +static void convert_image_to_monochrome(const struct aom_image *img, + struct aom_image *monochrome_img) { + *monochrome_img = *img; + monochrome_img->fmt = AOM_IMG_FMT_I420; + if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + monochrome_img->fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + } + monochrome_img->monochrome = 1; + monochrome_img->csp = AOM_CSP_UNKNOWN; + monochrome_img->x_chroma_shift = 1; + monochrome_img->y_chroma_shift = 1; + monochrome_img->planes[AOM_PLANE_U] = NULL; + monochrome_img->planes[AOM_PLANE_V] = NULL; + monochrome_img->stride[AOM_PLANE_U] = 0; + monochrome_img->stride[AOM_PLANE_V] = 0; + monochrome_img->sz = 0; + monochrome_img->bps = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
16 : 8; + monochrome_img->img_data = NULL; + monochrome_img->img_data_owner = 0; + monochrome_img->self_allocd = 0; +} + static void encode_frame(struct stream_state *stream, struct AvxEncoderConfig *global, struct aom_image *img, unsigned int frames_in) { @@ -1552,6 +1598,12 @@ static void encode_frame(struct stream_state *stream, #endif } + struct aom_image monochrome_img; + if (img && cfg->monochrome) { + convert_image_to_monochrome(img, &monochrome_img); + img = &monochrome_img; + } + aom_usec_timer_start(&timer); aom_codec_encode(&stream->encoder, img, frame_start, (uint32_t)(next_frame_start - frame_start), 0); @@ -1941,8 +1993,10 @@ int main(int argc, const char **argv_) { stream->config.cfg.g_profile = 1; profile_updated = 1; } - } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 || - input.fmt == AOM_IMG_FMT_I42216) { + } else if (input.bit_depth == 12 || + ((input.fmt == AOM_IMG_FMT_I422 || + input.fmt == AOM_IMG_FMT_I42216) && + !stream->config.cfg.monochrome)) { stream->config.cfg.g_profile = 2; profile_updated = 1; } diff --git a/third_party/libaom/source/libaom/av1/arg_defs.c b/third_party/libaom/source/libaom/av1/arg_defs.c index e79f9b2934..8646b09c9d 100644 --- a/third_party/libaom/source/libaom/av1/arg_defs.c +++ b/third_party/libaom/source/libaom/av1/arg_defs.c @@ -271,7 +271,9 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { .noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"), .sharpness = ARG_DEF(NULL, "sharpness", 1, - "Loop filter sharpness (0..7), default is 0"), + "Bias towards block sharpness in rate-distortion " + "optimization of transform coefficients " + "(0..7), default is 0"), .static_thresh = ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"), .auto_altref = @@ -448,13 +450,16 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { "Use Default-transform only for INTRA modes"), .quant_b_adapt = ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive 
quantize_b"), .coeff_cost_upd_freq = ARG_DEF(NULL, "coeff-cost-upd-freq", 1, - "Update freq for coeff costs" + "Update freq for coeff costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .mode_cost_upd_freq = ARG_DEF(NULL, "mode-cost-upd-freq", 1, - "Update freq for mode costs" + "Update freq for mode costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .mv_cost_upd_freq = ARG_DEF(NULL, "mv-cost-upd-freq", 1, - "Update freq for mv costs" + "Update freq for mv costs. " + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), + .dv_cost_upd_freq = ARG_DEF(NULL, "dv-cost-upd-freq", 1, + "Update freq for dv costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .num_tg = ARG_DEF(NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1"), @@ -471,6 +476,8 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { .vmaf_model_path = ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"), #endif + .partition_info_path = ARG_DEF(NULL, "partition-info-path", 1, + "Partition information read and write path"), .film_grain_test = ARG_DEF( NULL, "film-grain-test", 1, "Film grain test vectors (0: none (default), 1: test-1 2: test-2, " @@ -592,7 +599,9 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { "pyramid. Selected automatically from --cq-level if " "--fixed-qp-offsets is not provided. If this option is not " "specified (default), offsets are adaptively chosen by the " - "encoder."), + "encoder. Further, if this option is specified, at least two " + "comma-separated values corresponding to kf and arf offsets " + "must be provided, while the rest are chosen by the encoder"), .fixed_qp_offsets = ARG_DEF( NULL, "fixed-qp-offsets", 1, @@ -605,6 +614,6 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { .vbr_corpus_complexity_lap = ARG_DEF( NULL, "vbr-corpus-complexity-lap", 1, "Set average corpus complexity per mb for single pass VBR using lap. 
" - "(0..10000), default is 0") + "(0..10000), default is 0"), #endif // CONFIG_AV1_ENCODER }; diff --git a/third_party/libaom/source/libaom/av1/arg_defs.h b/third_party/libaom/source/libaom/av1/arg_defs.h index f86e91551c..6a8d0d47cf 100644 --- a/third_party/libaom/source/libaom/av1/arg_defs.h +++ b/third_party/libaom/source/libaom/av1/arg_defs.h @@ -173,12 +173,14 @@ typedef struct av1_codec_arg_definitions { arg_def_t coeff_cost_upd_freq; arg_def_t mode_cost_upd_freq; arg_def_t mv_cost_upd_freq; + arg_def_t dv_cost_upd_freq; arg_def_t num_tg; arg_def_t mtu_size; arg_def_t timing_info; #if CONFIG_TUNE_VMAF arg_def_t vmaf_model_path; #endif + arg_def_t partition_info_path; arg_def_t film_grain_test; arg_def_t film_grain_table; #if CONFIG_DENOISE diff --git a/third_party/libaom/source/libaom/av1/av1_cx_iface.c b/third_party/libaom/source/libaom/av1/av1_cx_iface.c index 123bb1dc41..11c47bca24 100644 --- a/third_party/libaom/source/libaom/av1/av1_cx_iface.c +++ b/third_party/libaom/source/libaom/av1/av1_cx_iface.c @@ -26,6 +26,7 @@ #include "av1/encoder/bitstream.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ethread.h" +#include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" #include "av1/arg_defs.h" @@ -51,6 +52,7 @@ struct av1_extracfg { unsigned int gf_max_pyr_height; aom_tune_metric tuning; const char *vmaf_model_path; + const char *partition_info_path; unsigned int cq_level; // constrained quality level unsigned int rc_max_intra_bitrate_pct; unsigned int rc_max_inter_bitrate_pct; @@ -154,12 +156,26 @@ struct av1_extracfg { COST_UPDATE_TYPE coeff_cost_upd_freq; COST_UPDATE_TYPE mode_cost_upd_freq; COST_UPDATE_TYPE mv_cost_upd_freq; + COST_UPDATE_TYPE dv_cost_upd_freq; unsigned int ext_tile_debug; unsigned int sb_multipass_unit_test; }; +#if CONFIG_REALTIME_ONLY +// Settings changed for realtime only build: +// cpu_used: 7 +// enable_tpl_model: 0 +// enable_restoration: 0 +// enable_obmc: 0 +// deltaq_mode: NO_DELTA_Q +// 
enable_global_motion usage: 0 +// enable_warped_motion at sequence level: 0 +// allow_warped_motion at frame level: 0 +// coeff_cost_upd_freq: COST_UPD_OFF +// mode_cost_upd_freq: COST_UPD_OFF +// mv_cost_upd_freq: COST_UPD_OFF static struct av1_extracfg default_extra_cfg = { - 0, // cpu_used + 7, // cpu_used 1, // enable_auto_alt_ref 0, // enable_auto_bwd_ref 0, // noise_sensitivity @@ -168,7 +184,7 @@ static struct av1_extracfg default_extra_cfg = { 1, // row_mt 0, // tile_columns 0, // tile_rows - 1, // enable_tpl_model + 0, // enable_tpl_model 1, // enable_keyframe_filtering 7, // arnr_max_frames 5, // arnr_strength @@ -177,31 +193,32 @@ static struct av1_extracfg default_extra_cfg = { 0, // gf_min_pyr_height 5, // gf_max_pyr_height AOM_TUNE_PSNR, // tuning - "/usr/local/share/model/vmaf_v0.6.1.pkl", // VMAF model path - 10, // cq_level - 0, // rc_max_intra_bitrate_pct - 0, // rc_max_inter_bitrate_pct - 0, // gf_cbr_boost_pct - 0, // lossless - 1, // enable_cdef - 1, // enable_restoration - 0, // force_video_mode - 1, // enable_obmc - 3, // disable_trellis_quant - 0, // enable_qm - DEFAULT_QM_Y, // qm_y - DEFAULT_QM_U, // qm_u - DEFAULT_QM_V, // qm_v - DEFAULT_QM_FIRST, // qm_min - DEFAULT_QM_LAST, // qm_max - 1, // max number of tile groups - 0, // mtu_size + "/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path + ".", // partition info path + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 1, // enable_cdef + 0, // enable_restoration + 0, // force_video_mode + 0, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream 0, // frame_parallel_decoding_mode 1, // enable dual filter 0, // enable delta quant in chroma planes 
NO_AQ, // aq_mode - DELTA_Q_OBJECTIVE, // deltaq_mode + NO_DELTA_Q, // deltaq_mode 0, // delta lf mode 0, // frame_periodic_boost AOM_BITS_8, // Bit depth @@ -243,9 +260,9 @@ static struct av1_extracfg default_extra_cfg = { 1, // enable difference-weighted compound 1, // enable interinter wedge compound 1, // enable interintra wedge compound - 1, // enable_global_motion usage - 1, // enable_warped_motion at sequence level - 1, // allow_warped_motion at frame level + 0, // enable_global_motion usage + 0, // enable_warped_motion at sequence level + 0, // allow_warped_motion at frame level 1, // enable filter intra at sequence level 1, // enable smooth intra modes usage for sequence 1, // enable Paeth intra mode usage for sequence @@ -277,15 +294,148 @@ static struct av1_extracfg default_extra_cfg = { SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + }, // target_seq_level_idx + 0, // tier_mask + 0, // min_cr + COST_UPD_OFF, // coeff_cost_upd_freq + COST_UPD_OFF, // mode_cost_upd_freq + COST_UPD_OFF, // mv_cost_upd_freq + COST_UPD_OFF, // dv_cost_upd_freq + 0, // ext_tile_debug + 0, // sb_multipass_unit_test +}; +#else +static struct av1_extracfg default_extra_cfg = { + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 1, // row_mt + 0, // tile_columns + 0, // tile_rows + 1, // enable_tpl_model + 1, // enable_keyframe_filtering + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + 0, // gf_min_pyr_height + 5, // gf_max_pyr_height + AOM_TUNE_PSNR, // tuning + "/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path + ".", // partition info path + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // 
lossless + 1, // enable_cdef + 1, // enable_restoration + 0, // force_video_mode + 1, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size + AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream + 0, // frame_parallel_decoding_mode + 1, // enable dual filter + 0, // enable delta quant in chroma planes + NO_AQ, // aq_mode + DELTA_Q_OBJECTIVE, // deltaq_mode + 0, // delta lf mode + 0, // frame_periodic_boost + AOM_BITS_8, // Bit depth + AOM_CONTENT_DEFAULT, // content + AOM_CICP_CP_UNSPECIFIED, // CICP color primaries + AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics + AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients + AOM_CSP_UNKNOWN, // chroma sample position + 0, // color range + 0, // render width + 0, // render height + AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size + 1, // this depends on large_scale_tile. + 0, // error_resilient_mode off by default. + 0, // s_frame_mode off by default. 
+ 0, // film_grain_test_vector + 0, // film_grain_table_filename + 0, // motion_vector_unit_test + 1, // CDF update mode + 1, // enable rectangular partitions + 1, // enable ab shape partitions + 1, // enable 1:4 and 4:1 partitions + 4, // min_partition_size + 128, // max_partition_size + 1, // enable intra edge filter + 1, // frame order hint + 1, // enable 64-pt transform usage + 1, // enable flip and identity transform + 1, // enable rectangular transform usage + 1, // dist-wtd compound + 7, // max_reference_frames + 0, // enable_reduced_reference_set + 1, // enable_ref_frame_mvs sequence level + 1, // allow ref_frame_mvs frame level + 1, // enable masked compound at sequence level + 1, // enable one sided compound at sequence level + 1, // enable interintra compound at sequence level + 1, // enable smooth interintra mode + 1, // enable difference-weighted compound + 1, // enable interinter wedge compound + 1, // enable interintra wedge compound + 1, // enable_global_motion usage + 1, // enable_warped_motion at sequence level + 1, // allow_warped_motion at frame level + 1, // enable filter intra at sequence level + 1, // enable smooth intra modes usage for sequence + 1, // enable Paeth intra mode usage for sequence + 1, // enable CFL uv intra mode usage for sequence + 1, // enable D45 to D203 intra mode usage for sequence + 1, // superres + 1, // enable overlay + 1, // enable palette + !CONFIG_SHARP_SETTINGS, // enable intrabc + 1, // enable angle delta +#if CONFIG_DENOISE + 0, // noise_level + 32, // noise_block_size + 1, // enable_dnl_denoising +#endif + 0, // chroma_subsampling_x + 0, // chroma_subsampling_y + 0, // reduced_tx_type_set + 0, // use_intra_dct_only + 0, // use_inter_dct_only + 0, // use_intra_default_tx_only + 0, // quant_b_adapt + 0, // vbr_corpus_complexity_lap + { + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, 
SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, }, // target_seq_level_idx 0, // tier_mask 0, // min_cr COST_UPD_SB, // coeff_cost_upd_freq COST_UPD_SB, // mode_cost_upd_freq COST_UPD_SB, // mv_cost_upd_freq + COST_UPD_SB, // dv_cost_upd_freq 0, // ext_tile_debug 0, // sb_multipass_unit_test }; +#endif struct aom_codec_alg_priv { aom_codec_priv_t base; @@ -380,7 +530,11 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1); RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1); RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); - RANGE_CHECK_HI(cfg, g_usage, 2); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_usage, AOM_USAGE_REALTIME, AOM_USAGE_REALTIME); +#else + RANGE_CHECK_HI(cfg, g_usage, AOM_USAGE_ALL_INTRA); +#endif RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); @@ -540,15 +694,6 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, } #endif -#if !CONFIG_USE_VMAF_RC - if (extra_cfg->tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { - ERROR( - "This error may be related to the wrong configuration options: try to " - "set -DCONFIG_TUNE_VMAF=1 and -DCONFIG_USE_VMAF_RC=1 at the time CMake" - " is run."); - } -#endif - RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_BUTTERAUGLI); RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED, @@ -572,6 +717,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3); + RANGE_CHECK(extra_cfg, dv_cost_upd_freq, 0, 3); 
RANGE_CHECK(extra_cfg, min_partition_size, 4, 128); RANGE_CHECK(extra_cfg, max_partition_size, 4, 128); @@ -619,13 +765,14 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx, #if CONFIG_TUNE_BUTTERAUGLI if (ctx->extra_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { - if (img->x_chroma_shift != 1 || img->y_chroma_shift != 1) { - ERROR("Only YV12/I420 images supported in tune=butteraugli mode."); + if (img->bit_depth > 8) { + ERROR("Only 8 bit depth images supported in tune=butteraugli mode."); } - if ((img->cp != 0 && img->cp != AOM_CICP_CP_BT_709) || - (img->tc != 0 && img->tc != AOM_CICP_TC_BT_709) || - (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709)) { - ERROR("Only BT.709 images supported in tune=butteraugli mode."); + if (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709 && + img->mc != AOM_CICP_MC_BT_601 && img->mc != AOM_CICP_MC_BT_470_B_G) { + ERROR( + "Only BT.709 and BT.601 matrix coefficients supported in " + "tune=butteraugli mode. Identity matrix is treated as BT.601."); } } #endif @@ -689,7 +836,6 @@ static void update_default_encoder_config(const cfg_options_t *cfg, extra_cfg->enable_smooth_intra = (cfg->disable_smooth_intra == 0); extra_cfg->enable_paeth_intra = (cfg->disable_paeth_intra == 0); extra_cfg->enable_cfl_intra = (cfg->disable_cfl == 0); - extra_cfg->enable_diagonal_intra = (cfg->disable_diagonal_intra == 0); extra_cfg->enable_obmc = (cfg->disable_obmc == 0); extra_cfg->enable_palette = (cfg->disable_palette == 0); extra_cfg->enable_intrabc = (cfg->disable_intrabc == 0); @@ -709,12 +855,12 @@ static double convert_qp_offset(int cq_level, int q_offset, int bit_depth) { return (base_q_val - new_q_val); } -static double get_modeled_qp_offset(int cq_level, int level, int bit_depth) { - // 80% for keyframe was derived empirically. - // 40% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF. +static double get_modeled_qp_offset(int qp, int level, int bit_depth) { + // 76% for keyframe was derived empirically. 
+ // 60% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF. // Rest derived similar to rc_pick_q_and_bounds_two_pass() - static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8 }; - const double q_val = av1_convert_qindex_to_q(cq_level, bit_depth); + static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8, 4 }; + const double q_val = av1_convert_qindex_to_q(qp, bit_depth); return q_val * percents[level] / 100; } @@ -916,6 +1062,7 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq; oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq; oxcf->cost_upd_freq.mv = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq; + oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq; // Set frame resize mode configuration. resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode; @@ -1044,7 +1191,7 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc; oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion; oxcf->motion_mode_cfg.allow_warped_motion = - (cfg->g_usage == AOM_USAGE_REALTIME) + (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7) ? 
false : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); @@ -1141,6 +1288,8 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, sizeof(oxcf->target_seq_level_idx)); oxcf->tier_mask = extra_cfg->tier_mask; + oxcf->partition_info_path = extra_cfg->partition_info_path; + return AOM_CODEC_OK; } @@ -1179,10 +1328,20 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); // On profile change, request a key frame - force_key |= ctx->ppi->cpi->common.seq_params.profile != ctx->oxcf.profile; - av1_change_config(ctx->ppi->cpi, &ctx->oxcf); + force_key |= ctx->ppi->seq_params.profile != ctx->oxcf.profile; + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ctx->ppi->num_fp_contexts; i++) { + av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, + is_sb_size_changed); + } +#else + av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (ctx->ppi->cpi_lap != NULL) { - av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf); + av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); } } @@ -1192,7 +1351,7 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, } static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) { - return av1_get_global_headers(ctx->ppi->cpi); + return av1_get_global_headers(ctx->ppi); } static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx, @@ -1215,7 +1374,7 @@ static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; - *arg = ctx->ppi->cpi->rc.baseline_gf_interval; + *arg = ctx->ppi->p_rc.baseline_gf_interval; return AOM_CODEC_OK; } @@ -1225,9 +1384,19 @@ static aom_codec_err_t 
update_extra_cfg(aom_codec_alg_priv_t *ctx, if (res == AOM_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - av1_change_config(ctx->ppi->cpi, &ctx->oxcf); + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ctx->ppi->num_fp_contexts; i++) { + av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, + is_sb_size_changed); + } +#else + av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (ctx->ppi->cpi_lap != NULL) { - av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf); + av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); } } return res; @@ -1299,7 +1468,13 @@ static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_tpl_model = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); + const unsigned int tpl_model_arg = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); +#if CONFIG_REALTIME_ONLY + if (tpl_model_arg) { + ERROR("TPL model can't be turned on in realtime only build."); + } +#endif + extra_cfg.enable_tpl_model = tpl_model_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1379,7 +1554,13 @@ static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args); + const unsigned int restoration_arg = CAST(AV1E_SET_ENABLE_RESTORATION, args); +#if CONFIG_REALTIME_ONLY + if (restoration_arg) { + ERROR("Restoration can't be turned on in realtime only build."); + } +#endif + extra_cfg.enable_restoration = restoration_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1393,7 
+1574,13 @@ static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args); + const unsigned int obmc_arg = CAST(AV1E_SET_ENABLE_OBMC, args); +#if CONFIG_REALTIME_ONLY + if (obmc_arg) { + ERROR("OBMC can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_obmc = obmc_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1637,14 +1824,26 @@ static aom_codec_err_t ctrl_set_enable_interintra_wedge( static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); + const int global_motion_arg = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); +#if CONFIG_REALTIME_ONLY + if (global_motion_arg) { + ERROR("Global motion can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_global_motion = global_motion_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); + const int warped_motion_arg = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); +#if CONFIG_REALTIME_ONLY + if (warped_motion_arg) { + ERROR("Warped motion can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_warped_motion = warped_motion_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1825,6 +2024,13 @@ static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_dv_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.dv_cost_upd_freq = 
CAST(AV1E_SET_DV_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1832,6 +2038,13 @@ static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_partition_info_path(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.partition_info_path = CAST(AV1E_SET_PARTITION_INFO_PATH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_film_grain_test_vector( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1890,7 +2103,13 @@ static aom_codec_err_t ctrl_set_enable_dnl_denoising(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args); + const DELTAQ_MODE deltaq_arg = CAST(AV1E_SET_DELTAQ_MODE, args); +#if CONFIG_REALTIME_ONLY + if (deltaq_arg > NO_DELTA_Q) { + ERROR("Delta Q mode can't be enabled in realtime only build."); + } +#endif + extra_cfg.deltaq_mode = deltaq_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1986,6 +2205,18 @@ static aom_codec_err_t ctrl_enable_sb_multipass_unit_test( return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_COMP *const cpi = ctx->ppi->cpi; + aom_ext_part_funcs_t funcs = *CAST(AV1E_SET_EXTERNAL_PARTITION, args); + aom_ext_part_config_t config; + // TODO(chengchen): verify the sb_size has been set at this point. 
+ config.superblock_size = cpi->common.seq_params->sb_size; + const aom_codec_err_t status = + av1_ext_part_create(funcs, config, &cpi->ext_part_controller); + return status; +} + #if !CONFIG_REALTIME_ONLY static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, STATS_BUFFER_CTX *stats_buf_context, @@ -2014,27 +2245,22 @@ static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, static aom_codec_err_t create_context_and_bufferpool( AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool, - AV1EncoderConfig *oxcf, struct aom_codec_pkt_list *pkt_list_head, - FIRSTPASS_STATS *frame_stats_buf, COMPRESSOR_STAGE stage, - int num_lap_buffers, int lap_lag_in_frames, - STATS_BUFFER_CTX *stats_buf_context) { + AV1EncoderConfig *oxcf, COMPRESSOR_STAGE stage, int lap_lag_in_frames) { aom_codec_err_t res = AOM_CODEC_OK; - *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); - if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; + if (*p_buffer_pool == NULL) { + *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); + if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; #if CONFIG_MULTITHREAD - if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) { - return AOM_CODEC_MEM_ERROR; - } + if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) { + return AOM_CODEC_MEM_ERROR; + } #endif - *p_cpi = av1_create_compressor(ppi, oxcf, *p_buffer_pool, frame_stats_buf, - stage, num_lap_buffers, lap_lag_in_frames, - stats_buf_context); - if (*p_cpi == NULL) - res = AOM_CODEC_MEM_ERROR; - else - (*p_cpi)->output_pkt_list = pkt_list_head; + } + *p_cpi = av1_create_compressor(ppi, oxcf, *p_buffer_pool, stage, + lap_lag_in_frames); + if (*p_cpi == NULL) res = AOM_CODEC_MEM_ERROR; return res; } @@ -2084,27 +2310,48 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) { priv->oxcf.use_highbitdepth = (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 
1 : 0; - priv->ppi = av1_create_primary_compressor(); + priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head, + *num_lap_buffers, &priv->oxcf); if (!priv->ppi) return AOM_CODEC_MEM_ERROR; #if !CONFIG_REALTIME_ONLY res = create_stats_buffer(&priv->frame_stats_buffer, &priv->stats_buf_context, *num_lap_buffers); if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR; + + assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); + int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS); + for (int i = 0; i < size; i++) + priv->ppi->twopass.frame_stats_arr[i] = &priv->frame_stats_buffer[i]; + + priv->ppi->twopass.stats_buf_ctx = &priv->stats_buf_context; + priv->ppi->twopass.stats_in = + priv->ppi->twopass.stats_buf_ctx->stats_in_start; #endif - res = create_context_and_bufferpool( - priv->ppi, &priv->ppi->cpi, &priv->buffer_pool, &priv->oxcf, - &priv->pkt_list.head, priv->frame_stats_buffer, ENCODE_STAGE, - *num_lap_buffers, -1, &priv->stats_buf_context); +#if CONFIG_FRAME_PARALLEL_ENCODE + assert(priv->ppi->num_fp_contexts >= 1); + int i; + for (i = 0; i < priv->ppi->num_fp_contexts; i++) { + res = create_context_and_bufferpool( + priv->ppi, &priv->ppi->parallel_cpi[i], &priv->buffer_pool, + &priv->oxcf, ENCODE_STAGE, -1); + if (res != AOM_CODEC_OK) { + return res; + } + } + priv->ppi->cpi = priv->ppi->parallel_cpi[0]; +#else + res = create_context_and_bufferpool(priv->ppi, &priv->ppi->cpi, + &priv->buffer_pool, &priv->oxcf, + ENCODE_STAGE, -1); +#endif // CONFIG_FRAME_PARALLEL_ENCODE // Create another compressor if look ahead is enabled if (res == AOM_CODEC_OK && *num_lap_buffers) { res = create_context_and_bufferpool( priv->ppi, &priv->ppi->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, - NULL, priv->frame_stats_buffer, LAP_STAGE, *num_lap_buffers, - clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS), - &priv->stats_buf_context); + LAP_STAGE, clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS)); } } } @@ -2113,12 +2360,16 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t 
*ctx) { } static void destroy_context_and_bufferpool(AV1_COMP *cpi, - BufferPool *buffer_pool) { + BufferPool **p_buffer_pool) { av1_remove_compressor(cpi); + if (*p_buffer_pool) { + av1_free_ref_frame_buffers(*p_buffer_pool); #if CONFIG_MULTITHREAD - if (buffer_pool) pthread_mutex_destroy(&buffer_pool->pool_mutex); + pthread_mutex_destroy(&(*p_buffer_pool)->pool_mutex); #endif - aom_free(buffer_pool); + aom_free(*p_buffer_pool); + *p_buffer_pool = NULL; + } } static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, @@ -2133,9 +2384,30 @@ static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) { if (ctx->ppi) { AV1_PRIMARY *ppi = ctx->ppi; - destroy_context_and_bufferpool(ppi->cpi, ctx->buffer_pool); +#if CONFIG_FRAME_PARALLEL_ENCODE + for (int i = 0; i < ppi->num_fp_contexts - 1; i++) { + if (ppi->parallel_frames_data[i].cx_data_frame) { + free(ppi->parallel_frames_data[i].cx_data_frame); + } + } +#endif +#if CONFIG_ENTROPY_STATS + print_entropy_stats(ppi); +#endif +#if CONFIG_INTERNAL_STATS + print_internal_stats(ppi); +#endif +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ppi->num_fp_contexts; i++) { + destroy_context_and_bufferpool(ppi->parallel_cpi[i], &ctx->buffer_pool); + } + ppi->cpi = NULL; +#else + destroy_context_and_bufferpool(ppi->cpi, &ctx->buffer_pool); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (ppi->cpi_lap) { - destroy_context_and_bufferpool(ppi->cpi_lap, ctx->buffer_pool_lap); + destroy_context_and_bufferpool(ppi->cpi_lap, &ctx->buffer_pool_lap); } av1_remove_primary_compressor(ppi); } @@ -2151,7 +2423,7 @@ static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi, aom_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && + (cpi->ppi->use_svc && svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + svc->temporal_layer_id] .is_key_frame)) @@ -2182,7 +2454,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, AV1_COMP 
*cpi_lap = ppi->cpi_lap; if (cpi == NULL) return AOM_CODEC_INVALID_PARAM; - if (cpi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0) + if (cpi->ppi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0) return AOM_CODEC_INVALID_PARAM; if (img != NULL) { @@ -2216,6 +2488,22 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, return AOM_CODEC_MEM_ERROR; } } +#if CONFIG_FRAME_PARALLEL_ENCODE + for (int i = 0; i < cpi->ppi->num_fp_contexts - 1; i++) { + if (cpi->ppi->parallel_frames_data[i].cx_data_frame == NULL) { + cpi->ppi->parallel_frames_data[i].cx_data_sz = uncompressed_frame_sz; + cpi->ppi->parallel_frames_data[i].frame_display_order_hint = -1; + cpi->ppi->parallel_frames_data[i].frame_size = 0; + cpi->ppi->parallel_frames_data[i].cx_data_frame = + (unsigned char *)malloc( + cpi->ppi->parallel_frames_data[i].cx_data_sz); + if (cpi->ppi->parallel_frames_data[i].cx_data_frame == NULL) { + cpi->ppi->parallel_frames_data[i].cx_data_sz = 0; + return AOM_CODEC_MEM_ERROR; + } + } + } +#endif } } @@ -2226,22 +2514,16 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. 
- if (setjmp(cpi->common.error.jmp)) { - cpi->common.error.setjmp = 0; - res = update_error_state(ctx, &cpi->common.error); + if (setjmp(ppi->error.jmp)) { + ppi->error.setjmp = 0; + res = update_error_state(ctx, &ppi->error); aom_clear_system_state(); return res; } - cpi->common.error.setjmp = 1; - if (cpi_lap != NULL) { - if (setjmp(cpi_lap->common.error.jmp)) { - cpi_lap->common.error.setjmp = 0; - res = update_error_state(ctx, &cpi_lap->common.error); - aom_clear_system_state(); - return res; - } - cpi_lap->common.error.setjmp = 1; - } + ppi->error.setjmp = 1; + + if (cpi->ppi->use_svc && cpi->svc.use_flexible_mode == 0 && flags == 0) + av1_set_svc_fixed_mode(cpi); // Note(yunqing): While applying encoding flags, always start from enabling // all, and then modifying according to the flags. Previous frame's flags are @@ -2251,9 +2533,12 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, av1_apply_encoding_flags(cpi_lap, flags); } -#if CONFIG_USE_VMAF_RC - aom_init_vmaf_model_rc(&cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.vmaf_model_path); +#if CONFIG_TUNE_VMAF + if (ctx->extra_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + ctx->extra_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + aom_init_vmaf_model(&cpi->vmaf_info.vmaf_model, + cpi->oxcf.tune_cfg.vmaf_model_path); + } #endif // Handle fixed keyframe intervals @@ -2270,7 +2555,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, if (res == AOM_CODEC_OK) { // Set up internal flags - if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; + if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) + cpi->ppi->b_calculate_psnr = 1; if (img != NULL) { if (!ctx->pts_offset_initialized) { @@ -2306,11 +2592,18 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, cpi->oxcf.tool_cfg.enable_global_motion); } if (!ppi->lookahead) - aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to 
allocate lag buffers"); - +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ppi->num_fp_contexts; i++) { + av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, + subsampling_x, subsampling_y); + } +#else av1_check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); +#endif if (cpi_lap != NULL) { av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x, subsampling_y); @@ -2320,7 +2613,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, // key frame flag when we actually encode this frame. if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, src_time_stamp, src_end_time_stamp)) { - res = update_error_state(ctx, &cpi->common.error); + res = update_error_state(ctx, &ppi->error); } ctx->next_frame_flags = 0; } @@ -2337,7 +2630,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, * the buffer size anyway. */ if (cx_data_sz < ctx->cx_data_sz / 2) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "Compressed data buffer too small"); } } @@ -2358,6 +2651,12 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } if ((num_workers > 1) && (cpi->mt_info.num_workers == 0)) { av1_create_workers(cpi, num_workers); +#if CONFIG_MULTITHREAD + av1_init_mt_sync(cpi, cpi->oxcf.pass == 1); + if (cpi_lap != NULL) { + av1_init_mt_sync(cpi_lap, 1); + } +#endif // CONFIG_MULTITHREAD if (cpi->oxcf.pass != 1) { av1_create_second_pass_workers(cpi, num_workers); } @@ -2373,13 +2672,12 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } cpi_lap->mt_info.num_workers = cpi->mt_info.num_workers; const int status = av1_get_compressed_data( - cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la, - &dst_end_time_stamp_la, !img, timestamp_ratio); + cpi_lap, &lib_flags, &frame_size, cx_data_sz, NULL, + &dst_time_stamp_la, &dst_end_time_stamp_la, !img, timestamp_ratio); if (status != -1) { if (status != 
AOM_CODEC_OK) { - aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } - cpi_lap->seq_params_locked = 1; } lib_flags = 0; frame_size = 0; @@ -2390,15 +2688,39 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, int64_t dst_time_stamp; int64_t dst_end_time_stamp; while (cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) { +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi->do_frame_data_update = true; + if (ppi->num_fp_contexts > 1 && ppi->gf_group.size > 1) { + if (cpi->gf_frame_index < ppi->gf_group.size) { + calc_frame_data_update_flag(&ppi->gf_group, cpi->gf_frame_index, + &cpi->do_frame_data_update); + } + } +#endif const int status = av1_get_compressed_data( - cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp, + cpi, &lib_flags, &frame_size, cx_data_sz, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img, timestamp_ratio); if (status == -1) break; if (status != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } - cpi->seq_params_locked = 1; +#if CONFIG_ENTROPY_STATS + if (ppi->cpi->oxcf.pass != 1 && !cpi->common.show_existing_frame) + av1_accumulate_frame_counts(&ppi->aggregate_fc, &cpi->counts); +#endif +#if CONFIG_INTERNAL_STATS + if (ppi->cpi->oxcf.pass != 1) { + ppi->total_time_compress_data += cpi->time_compress_data; + ppi->total_recode_hits += cpi->frame_recode_hits; + ppi->total_bytes += cpi->bytes; + for (int i = 0; i < MAX_MODES; i++) { + ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i]; + } + } +#endif // CONFIG_INTERNAL_STATS + + cpi->ppi->seq_params_locked = 1; if (!frame_size) continue; assert(cx_data != NULL && cx_data_sz != 0); const int write_temporal_delimiter = @@ -2413,12 +2735,13 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, const size_t move_offset = obu_header_size + length_field_size; memmove(ctx->cx_data + move_offset, 
ctx->cx_data, frame_size); obu_header_size = av1_write_obu_header( - &cpi->level_params, OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data); + &cpi->ppi->level_params, &cpi->frame_header_count, + OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data); // OBUs are preceded/succeeded by an unsigned leb128 coded integer. if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, ctx->cx_data) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } frame_size += obu_header_size + obu_payload_size + length_field_size; @@ -2428,7 +2751,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, size_t curr_frame_size = frame_size; if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } frame_size = curr_frame_size; @@ -2437,7 +2760,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, memmove(cx_data + length_field_size, cx_data, frame_size); if (av1_write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } frame_size += length_field_size; } @@ -2458,7 +2781,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, aom_codec_cx_pkt_t pkt; // decrement frames_left counter - cpi->frames_left = AOMMAX(0, cpi->frames_left - 1); + cpi->ppi->frames_left = AOMMAX(0, cpi->ppi->frames_left - 1); if (ctx->oxcf.save_as_annexb) { // B_PRIME (add TU size) size_t tu_size = ctx->pending_cx_data_sz; @@ -2466,7 +2789,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, memmove(ctx->cx_data + length_field_size, ctx->cx_data, tu_size); if (av1_write_uleb_obu_size(0, (uint32_t)tu_size, ctx->cx_data) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + 
aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } ctx->pending_cx_data_sz += length_field_size; } @@ -2496,7 +2819,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } } - cpi->common.error.setjmp = 0; + ppi->error.setjmp = 0; return res; } @@ -2674,7 +2997,7 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx, const int number_spatial_layers = va_arg(args, int); if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS) return AOM_CODEC_INVALID_PARAM; - ctx->ppi->cpi->common.number_spatial_layers = number_spatial_layers; + ctx->ppi->number_spatial_layers = number_spatial_layers; return AOM_CODEC_OK; } @@ -2690,19 +3013,20 @@ static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, va_list args) { - AV1_COMP *const cpi = ctx->ppi->cpi; + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; AV1_COMMON *const cm = &cpi->common; aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *); - cm->number_spatial_layers = params->number_spatial_layers; - cm->number_temporal_layers = params->number_temporal_layers; + ppi->number_spatial_layers = params->number_spatial_layers; + ppi->number_temporal_layers = params->number_temporal_layers; cpi->svc.number_spatial_layers = params->number_spatial_layers; cpi->svc.number_temporal_layers = params->number_temporal_layers; - if (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1) { + if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) { unsigned int sl, tl; - cpi->use_svc = 1; - for (sl = 0; sl < cm->number_spatial_layers; ++sl) { - for (tl = 0; tl < cm->number_temporal_layers; ++tl) { - const int layer = LAYER_IDS_TO_IDX(sl, tl, cm->number_temporal_layers); + ctx->ppi->use_svc = 1; + for (sl = 0; sl < ppi->number_spatial_layers; ++sl) { + for (tl = 0; tl < ppi->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(sl, tl, 
ppi->number_temporal_layers); LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; lc->max_q = params->max_quantizers[layer]; lc->min_q = params->min_quantizers[layer]; @@ -2713,11 +3037,11 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, } } if (cm->current_frame.frame_number == 0) { - if (!cpi->seq_params_locked) { - SequenceHeader *const seq_params = &cm->seq_params; + if (!cpi->ppi->seq_params_locked) { + SequenceHeader *const seq_params = &ppi->seq_params; seq_params->operating_points_cnt_minus_1 = - cm->number_spatial_layers * cm->number_temporal_layers - 1; - av1_init_seq_coding_tools(&cm->seq_params, cm, &cpi->oxcf, 1); + ppi->number_spatial_layers * ppi->number_temporal_layers - 1; + av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1); } av1_init_layer_context(cpi); } @@ -2732,13 +3056,15 @@ static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx, AV1_COMP *const cpi = ctx->ppi->cpi; aom_svc_ref_frame_config_t *const data = va_arg(args, aom_svc_ref_frame_config_t *); - cpi->svc.external_ref_frame_config = 1; + cpi->svc.set_ref_frame_config = 1; for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) { cpi->svc.reference[i] = data->reference[i]; cpi->svc.ref_idx[i] = data->ref_idx[i]; } for (unsigned int i = 0; i < REF_FRAMES; ++i) cpi->svc.refresh[i] = data->refresh[i]; + cpi->svc.use_flexible_mode = 1; + cpi->svc.ksvc_fixed_mode = 0; return AOM_CODEC_OK; } @@ -2831,18 +3157,17 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, // Used to mock the argv with just one string "--{name}={value}" char *argv[2] = { NULL, "" }; size_t len = strlen(name) + strlen(value) + 4; - char *err_string = ctx->ppi->cpi->common.error.detail; + char *err_string = ctx->ppi->error.detail; #if __STDC_VERSION__ >= 201112L // We use the keyword _Static_assert because clang-cl does not allow the // convenience macro static_assert to be used in function scope. See // https://bugs.llvm.org/show_bug.cgi?id=48904. 
- _Static_assert( - sizeof(ctx->ppi->cpi->common.error.detail) >= ARG_ERR_MSG_MAX_LEN, - "The size of the err_msg buffer for arg_match_helper must be " - "at least ARG_ERR_MSG_MAX_LEN"); + _Static_assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN, + "The size of the err_msg buffer for arg_match_helper must be " + "at least ARG_ERR_MSG_MAX_LEN"); #else - assert(sizeof(ctx->ppi->cpi->common.error.detail) >= ARG_ERR_MSG_MAX_LEN); + assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN); #endif argv[0] = aom_malloc(len * sizeof(argv[1][0])); @@ -2909,8 +3234,11 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, extra_cfg.vmaf_model_path = value; } #endif - else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv, - err_string)) { + else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.partition_info_path, + argv, err_string)) { + extra_cfg.partition_info_path = value; + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv, + err_string)) { extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_intra_rate_pct, argv, err_string)) { @@ -3161,6 +3489,9 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mv_cost_upd_freq, argv, err_string)) { extra_cfg.mv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dv_cost_upd_freq, + argv, err_string)) { + extra_cfg.dv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); } #if CONFIG_DENOISE else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_noise_level, @@ -3215,9 +3546,8 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); - const AV1_COMP *const cpi = ctx->ppi->cpi; if (arg == NULL) return 
AOM_CODEC_INVALID_PARAM; - return av1_get_seq_level_idx(&cpi->common.seq_params, &cpi->level_params, + return av1_get_seq_level_idx(&ctx->ppi->seq_params, &ctx->ppi->level_params, arg); } @@ -3332,6 +3662,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size }, { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding }, { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path }, + { AV1E_SET_PARTITION_INFO_PATH, ctrl_set_partition_info_path }, { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector }, { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table }, { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level }, @@ -3347,6 +3678,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap }, { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test }, + { AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq }, + { AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -3364,6 +3697,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { }; static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { +#if !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_GOOD_QUALITY, // g_usage - non-realtime usage @@ -3415,25 +3749,26 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 2000, // rc_two_pass_vbrmax_section // keyframing settings (kf) - 0, // fwd_kf_enabled - AOM_KF_AUTO, // kf_mode - 0, // kf_min_dist - 9999, // kf_max_dist - 0, // sframe_dist - 1, // sframe_mode - 0, // large_scale_tile - 0, // monochrome - 0, // full_still_picture_hdr - 0, // save_as_annexb - 0, // tile_width_count - 0, // tile_height_count - { 0 }, // tile_widths - { 0 }, // tile_heights - 0, // use_fixed_qp_offsets - { -1, -1, -1, -1, -1 }, // fixed_qp_offsets + 0, // fwd_kf_enabled + AOM_KF_AUTO, 
// kf_mode + 0, // kf_min_dist + 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + 0, // use_fixed_qp_offsets + { -1, -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg }, +#endif // !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_REALTIME, // g_usage - real-time usage @@ -3485,25 +3820,26 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 2000, // rc_two_pass_vbrmax_section // keyframing settings (kf) - 0, // fwd_kf_enabled - AOM_KF_AUTO, // kf_mode - 0, // kf_min_dist - 9999, // kf_max_dist - 0, // sframe_dist - 1, // sframe_mode - 0, // large_scale_tile - 0, // monochrome - 0, // full_still_picture_hdr - 0, // save_as_annexb - 0, // tile_width_count - 0, // tile_height_count - { 0 }, // tile_widths - { 0 }, // tile_heights - 0, // use_fixed_qp_offsets - { -1, -1, -1, -1, -1 }, // fixed_qp_offsets + 0, // fwd_kf_enabled + AOM_KF_AUTO, // kf_mode + 0, // kf_min_dist + 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + 0, // use_fixed_qp_offsets + { -1, -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg }, +#if !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_ALL_INTRA, // g_usage - all intra usage @@ -3572,8 +3908,9 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 0, // use_fixed_qp_offsets 
{ -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg }, +#endif // !CONFIG_REALTIME_ONLY }; // This data structure and function are exported in aom/aomcx.h @@ -3598,13 +3935,13 @@ aom_codec_iface_t aom_codec_av1_cx_algo = { }, { // NOLINT - 3, // 3 cfg - encoder_usage_cfg, // aom_codec_enc_cfg_t - encoder_encode, // aom_codec_encode_fn_t - encoder_get_cxdata, // aom_codec_get_cx_data_fn_t - encoder_set_config, // aom_codec_enc_config_set_fn_t - encoder_get_global_headers, // aom_codec_get_global_headers_fn_t - encoder_get_preview // aom_codec_get_preview_frame_fn_t + NELEMENTS(encoder_usage_cfg), // cfg_count + encoder_usage_cfg, // aom_codec_enc_cfg_t + encoder_encode, // aom_codec_encode_fn_t + encoder_get_cxdata, // aom_codec_get_cx_data_fn_t + encoder_set_config, // aom_codec_enc_config_set_fn_t + encoder_get_global_headers, // aom_codec_get_global_headers_fn_t + encoder_get_preview // aom_codec_get_preview_frame_fn_t }, encoder_set_option // aom_codec_set_option_fn_t }; diff --git a/third_party/libaom/source/libaom/av1/av1_dx_iface.c b/third_party/libaom/source/libaom/av1/av1_dx_iface.c index 1ee8a576d3..02968abd16 100644 --- a/third_party/libaom/source/libaom/av1/av1_dx_iface.c +++ b/third_party/libaom/source/libaom/av1/av1_dx_iface.c @@ -115,14 +115,18 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { if (ctx->frame_worker != NULL) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; aom_get_worker_interface()->end(worker); - aom_free(frame_worker_data->pbi->common.tpl_mvs); - frame_worker_data->pbi->common.tpl_mvs = NULL; + aom_free(pbi->common.tpl_mvs); + pbi->common.tpl_mvs = NULL; av1_remove_common(&frame_worker_data->pbi->common); + 
av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync, + pbi->num_workers); + av1_free_cdef_sync(&pbi->cdef_sync); #if !CONFIG_REALTIME_ONLY - av1_free_restoration_buffers(&frame_worker_data->pbi->common); + av1_free_restoration_buffers(&pbi->common); #endif - av1_decoder_remove(frame_worker_data->pbi); + av1_decoder_remove(pbi); aom_free(frame_worker_data); #if CONFIG_MULTITHREAD pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); @@ -392,7 +396,7 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) { pool->release_fb_cb = av1_release_frame_buffer; if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to initialize internal frame buffers"); pool->cb_priv = &pool->int_frame_buffers; @@ -527,7 +531,7 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, *data = frame_worker_data->data_end; if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); + return update_error_state(ctx, &frame_worker_data->pbi->error); check_resync(ctx, frame_worker_data->pbi); @@ -558,7 +562,7 @@ static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx, check_resync(ctx, frame_worker_data->pbi); if (ctx->frame_worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); + return update_error_state(ctx, &frame_worker_data->pbi->error); // Allow extra zero bytes after the frame end while (data < data_end) { @@ -823,7 +827,7 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, aom_image_t *res = add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); if (!res) { - aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Grain systhesis failed\n"); } *index += 1; // Advance the iterator to point to the next image @@ -1091,10 +1095,9 @@ static aom_codec_err_t 
ctrl_get_still_picture(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; - still_picture_info->is_still_picture = - (int)pbi->common.seq_params.still_picture; + still_picture_info->is_still_picture = (int)pbi->seq_params.still_picture; still_picture_info->is_reduced_still_picture_hdr = - (int)(pbi->common.seq_params.reduced_still_picture_hdr); + (int)(pbi->seq_params.reduced_still_picture_hdr); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; @@ -1112,7 +1115,7 @@ static aom_codec_err_t ctrl_get_sb_size(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; - if (pbi->common.seq_params.sb_size == BLOCK_128X128) { + if (pbi->seq_params.sb_size == BLOCK_128X128) { *sb_size = AOM_SUPERBLOCK_SIZE_128X128; } else { *sb_size = AOM_SUPERBLOCK_SIZE_64X64; @@ -1291,7 +1294,7 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; - *bit_depth = cm->seq_params.bit_depth; + *bit_depth = cm->seq_params->bit_depth; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; @@ -1327,9 +1330,9 @@ static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx, (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; - *img_fmt = get_img_format(cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth); + *img_fmt = get_img_format(cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; diff --git a/third_party/libaom/source/libaom/av1/common/alloccommon.c b/third_party/libaom/source/libaom/av1/common/alloccommon.c index cd997cd875..8624255218 100644 --- 
a/third_party/libaom/source/libaom/av1/common/alloccommon.c +++ b/third_party/libaom/source/libaom/av1/common/alloccommon.c @@ -17,8 +17,10 @@ #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" +#include "av1/common/cdef_block.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/thread_common.h" int av1_get_MBs(int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); @@ -51,6 +53,227 @@ void av1_free_ref_frame_buffers(BufferPool *pool) { } } +static INLINE void free_cdef_linebuf_conditional( + AV1_COMMON *const cm, const size_t *new_linebuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm, + uint16_t **colbuf, + uint16_t **srcbuf, + const size_t *new_colbuf_size, + const size_t new_srcbuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) { + aom_free(*srcbuf); + *srcbuf = NULL; + } + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { + aom_free(*srcbuf); + *srcbuf = NULL; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } +} + +static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt == NULL) return; +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); + 
pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); + aom_free((*cdef_row_mt)[row_idx].row_mutex_); + aom_free((*cdef_row_mt)[row_idx].row_cond_); + } +#else + (void)num_mi_rows; +#endif // CONFIG_MULTITHREAD + aom_free(*cdef_row_mt); + *cdef_row_mt = NULL; +} + +void av1_free_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync, int num_workers) { + CdefInfo *cdef_info = &cm->cdef_info; + const int num_mi_rows = cdef_info->allocated_mi_rows; + + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + // De-allocation of column buffer & source buffer (worker_0). + free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf); + + if (num_workers < 2) return; + if (*cdef_worker != NULL) { + for (int idx = num_workers - 1; idx >= 1; idx--) { + // De-allocation of column buffer & source buffer for remaining workers. + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + } + aom_free(*cdef_worker); + *cdef_worker = NULL; + } + free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows); +} + +static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, + const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < num_planes; plane++) { + if (linebuf[plane] == NULL) + CHECK_MEM_ERROR(cm, linebuf[plane], + aom_malloc(cdef_info->allocated_linebuf_size[plane])); + } +} + +static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, + uint16_t **srcbuf, const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + if (*srcbuf == NULL) + CHECK_MEM_ERROR(cm, *srcbuf, + aom_memalign(16, cdef_info->allocated_srcbuf_size)); + + for (int plane = 0; plane < num_planes; plane++) { + if (colbuf[plane] == NULL) + CHECK_MEM_ERROR(cm, colbuf[plane], + aom_malloc(cdef_info->allocated_colbuf_size[plane])); + } +} + +static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm, + 
AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt != NULL) return; + + CHECK_MEM_ERROR(cm, *cdef_row_mt, + aom_malloc(sizeof(**cdef_row_mt) * num_mi_rows)); +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_))); + pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL); + + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_))); + pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL); + + (*cdef_row_mt)[row_idx].is_row_done = 0; + } +#endif // CONFIG_MULTITHREAD +} + +void av1_alloc_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync, int num_workers) { + const int num_planes = av1_num_planes(cm); + size_t new_linebuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_colbuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_srcbuf_size = 0; + CdefInfo *const cdef_info = &cm->cdef_info; + // Check for configuration change + const int num_mi_rows = + (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int is_num_workers_changed = + cdef_info->allocated_num_workers != num_workers; + const int is_cdef_enabled = + cm->seq_params->enable_cdef && !cm->tiles.large_scale; + + // num-bufs=3 represents ping-pong buffers for top linebuf, + // followed by bottom linebuf. + // ping-pong is to avoid top linebuf over-write by consecutive row. + int num_bufs = 3; + if (num_workers > 1) + num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + if (is_cdef_enabled) { + // Calculate src buffer size + new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE; + for (int plane = 0; plane < num_planes; plane++) { + const int shift = + plane == AOM_PLANE_Y ? 
0 : cm->seq_params->subsampling_x; + // Calculate top and bottom line buffer size + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs * + (CDEF_VBORDER << 1) * (luma_stride >> shift); + // Calculate column buffer size + const int block_height = + (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER; + new_colbuf_size[plane] = + sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER; + } + } + + // Free src, line and column buffers for worker 0 in case of reallocation + free_cdef_linebuf_conditional(cm, new_linebuf_size); + free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf, + new_colbuf_size, new_srcbuf_size); + + if (*cdef_worker != NULL) { + if (is_num_workers_changed) { + // Free src and column buffers for remaining workers in case of change in + // num_workers + for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + } else if (num_workers > 1) { + // Free src and column buffers for remaining workers in case of + // reallocation + for (int idx = num_workers - 1; idx >= 1; idx--) + free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf, + &(*cdef_worker)[idx].srcbuf, new_colbuf_size, + new_srcbuf_size); + } + } + + if (cdef_info->allocated_mi_rows != num_mi_rows) + free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows); + + // Store allocated sizes for reallocation + cdef_info->allocated_srcbuf_size = new_srcbuf_size; + av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size); + av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size); + // Store configuration to check change in configuration + cdef_info->allocated_mi_rows = num_mi_rows; + cdef_info->allocated_num_workers = num_workers; + + if (!is_cdef_enabled) return; + + // Memory allocation of column buffer & source buffer (worker_0). 
+ alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes); + alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes); + + if (num_workers < 2) return; + + if (*cdef_worker == NULL) + CHECK_MEM_ERROR(cm, *cdef_worker, + aom_calloc(num_workers, sizeof(**cdef_worker))); + + // Memory allocation of column buffer & source buffer for remaining workers. + for (int idx = num_workers - 1; idx >= 1; idx--) + alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf, + num_planes); + + alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt, + cdef_info->allocated_mi_rows); +} + #if !CONFIG_REALTIME_ONLY // Assumes cm->rst_info[p].restoration_unit_size is already initialized void av1_alloc_restoration_buffers(AV1_COMMON *cm) { @@ -86,11 +309,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) { // Now we need to allocate enough space to store the line buffers for the // stripes const int frame_w = cm->superres_upscaled_width; - const int use_highbd = cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; for (int p = 0; p < num_planes; ++p) { const int is_uv = p > 0; - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT diff --git a/third_party/libaom/source/libaom/av1/common/alloccommon.h b/third_party/libaom/source/libaom/av1/common/alloccommon.h index e75c226831..0b43889d20 100644 --- a/third_party/libaom/source/libaom/av1/common/alloccommon.h +++ b/third_party/libaom/source/libaom/av1/common/alloccommon.h @@ -24,6 +24,8 @@ struct AV1Common; struct BufferPool; struct CommonContexts; struct CommonModeInfoParams; +struct AV1CdefWorker; +struct AV1CdefSyncData; void av1_remove_common(struct AV1Common *cm); @@ -36,6 +38,12 @@ void av1_init_mi_buffers(struct 
CommonModeInfoParams *mi_params); void av1_free_context_buffers(struct AV1Common *cm); void av1_free_ref_frame_buffers(struct BufferPool *pool); +void av1_alloc_cdef_buffers(struct AV1Common *const cm, + struct AV1CdefWorker **cdef_worker, + struct AV1CdefSyncData *cdef_sync, int num_workers); +void av1_free_cdef_buffers(struct AV1Common *const cm, + struct AV1CdefWorker **cdef_worker, + struct AV1CdefSyncData *cdef_sync, int num_workers); #if !CONFIG_REALTIME_ONLY void av1_alloc_restoration_buffers(struct AV1Common *cm); void av1_free_restoration_buffers(struct AV1Common *cm); diff --git a/third_party/libaom/source/libaom/av1/common/av1_common_int.h b/third_party/libaom/source/libaom/av1/common/av1_common_int.h index 0a68cb5fd5..981a186579 100644 --- a/third_party/libaom/source/libaom/av1/common/av1_common_int.h +++ b/third_party/libaom/source/libaom/av1/common/av1_common_int.h @@ -135,7 +135,10 @@ typedef struct RefCntBuffer { // distance when a very old frame is used as a reference. unsigned int display_order_hint; unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME]; - +#if CONFIG_FRAME_PARALLEL_ENCODE + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; +#endif // CONFIG_FRAME_PARALLEL_ENCODE MV_REF *mvs; uint8_t *seg_map; struct segmentation seg; @@ -192,12 +195,32 @@ typedef struct BufferPool { /*!\brief Parameters related to CDEF */ typedef struct { - int cdef_damping; /*!< CDEF damping factor */ - int nb_cdef_strengths; /*!< Number of CDEF strength values */ - int cdef_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for luma */ - int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for - chroma */ - int cdef_bits; /*!< Number of CDEF strength values in bits */ + //! CDEF column line buffer + uint16_t *colbuf[MAX_MB_PLANE]; + //! CDEF top & bottom line buffer + uint16_t *linebuf[MAX_MB_PLANE]; + //! CDEF intermediate buffer + uint16_t *srcbuf; + //! 
CDEF column line buffer sizes + size_t allocated_colbuf_size[MAX_MB_PLANE]; + //! CDEF top and bottom line buffer sizes + size_t allocated_linebuf_size[MAX_MB_PLANE]; + //! CDEF intermediate buffer size + size_t allocated_srcbuf_size; + //! CDEF damping factor + int cdef_damping; + //! Number of CDEF strength values + int nb_cdef_strengths; + //! CDEF strength values for luma + int cdef_strengths[CDEF_MAX_STRENGTHS]; + //! CDEF strength values for chroma + int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; + //! Number of CDEF strength values in bits + int cdef_bits; + //! Number of rows in the frame in 4 pixel + int allocated_mi_rows; + //! Number of CDEF workers + int allocated_num_workers; } CdefInfo; /*!\cond */ @@ -320,6 +343,10 @@ typedef struct { unsigned int order_hint; unsigned int display_order_hint; +#if CONFIG_FRAME_PARALLEL_ENCODE + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; +#endif // CONFIG_FRAME_PARALLEL_ENCODE unsigned int frame_number; SkipModeInfo skip_mode_info; int refresh_frame_flags; // Which ref frames are overwritten by this frame @@ -602,12 +629,12 @@ struct CommonQuantParams { /*! * Delta of qindex (from base_qindex) for V plane DC coefficients. - * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. */ int u_ac_delta_q; /*! * Delta of qindex (from base_qindex) for V plane AC coefficients. - * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. */ int v_ac_delta_q; @@ -728,7 +755,7 @@ typedef struct AV1Common { /*! * Code and details about current error status. */ - struct aom_internal_error_info error; + struct aom_internal_error_info *error; /*! * AV1 allows two types of frame scaling operations: @@ -780,10 +807,6 @@ typedef struct AV1Common { uint8_t superres_scale_denominator; /*! 
- * If true, buffer removal times are present. - */ - bool buffer_removal_time_present; - /*! * buffer_removal_times[op_num] specifies the frame removal time in units of * DecCT clock ticks counted from the removal time of the last random access * point for operating point op_num. @@ -950,7 +973,7 @@ typedef struct AV1Common { * Elements part of the sequence header, that are applicable for all the * frames in the video. */ - SequenceHeader seq_params; + SequenceHeader *seq_params; /*! * Current CDFs of all the symbols for the current frame. @@ -982,7 +1005,7 @@ typedef struct AV1Common { CommonContexts above_contexts; /** - * \name Signaled when cm->seq_params.frame_id_numbers_present_flag == 1 + * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1 */ /**@{*/ int current_frame_id; /*!< frame ID for the current frame. */ @@ -1014,20 +1037,12 @@ typedef struct AV1Common { int8_t ref_frame_side[REF_FRAMES]; /*! - * Number of temporal layers: may be > 1 for SVC (scalable vector coding). - */ - unsigned int number_temporal_layers; - /*! * Temporal layer ID of this frame * (in the range 0 ... (number_temporal_layers - 1)). */ int temporal_layer_id; /*! - * Number of spatial layers: may be > 1 for SVC (scalable vector coding). - */ - unsigned int number_spatial_layers; - /*! * Spatial layer ID of this frame * (in the range 0 ... (number_spatial_layers - 1)). */ @@ -1192,15 +1207,15 @@ static INLINE RefCntBuffer *get_primary_ref_frame_buf( // Returns 1 if this frame might allow mvs from some reference frame. 
static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && - cm->seq_params.order_hint_info.enable_ref_frame_mvs && - cm->seq_params.order_hint_info.enable_order_hint && + cm->seq_params->order_hint_info.enable_ref_frame_mvs && + cm->seq_params->order_hint_info.enable_order_hint && !frame_is_intra_only(cm); } // Returns 1 if this frame might use warped_motion static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && - cm->seq_params.enable_warped_motion; + cm->seq_params->enable_warped_motion; } static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { @@ -1240,7 +1255,7 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); static INLINE int av1_num_planes(const AV1_COMMON *cm) { - return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE; + return cm->seq_params->monochrome ? 
1 : MAX_MB_PLANE; } static INLINE void av1_init_above_context(CommonContexts *above_contexts, @@ -1279,8 +1294,8 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { } } xd->mi_stride = cm->mi_params.mi_stride; - xd->error_info = &cm->error; - cfl_init(&xd->cfl, &cm->seq_params); + xd->error_info = cm->error; + cfl_init(&xd->cfl, cm->seq_params); } static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, @@ -1562,7 +1577,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd, int mi_col_start, int mi_col_end, const int tile_row) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); const int width = mi_col_end - mi_col_start; const int aligned_width = diff --git a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c index caa15c21e2..18ae0f28f4 100644 --- a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c +++ b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c @@ -351,8 +351,14 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; - const int y_range = (MAX_MIB_SIZE >> scale_vert); - const int x_range = (MAX_MIB_SIZE >> scale_horz); + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); for (int y = 0; y < y_range; y++) { uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; for (int x = 0; x < x_range;) { @@ 
-376,8 +382,8 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, } #if CONFIG_AV1_HIGHBITDEPTH - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + const int use_highbitdepth = cm->seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -456,6 +462,84 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, } } +void av1_filter_block_plane_vert_rt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + assert(!plane); + assert(!(y_range % 2)); + for (int y = 0; y < y_range; y += 2) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
+ // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + aom_lpf_vertical_6_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, @@ -464,8 +548,14 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; - const int y_range = (MAX_MIB_SIZE >> scale_vert); - const int x_range = 
(MAX_MIB_SIZE >> scale_horz); + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); for (int x = 0; x < x_range; x++) { uint8_t *p = dst_ptr + x * MI_SIZE; for (int y = 0; y < y_range;) { @@ -489,8 +579,8 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, } #if CONFIG_AV1_HIGHBITDEPTH - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + const int use_highbitdepth = cm->seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -572,6 +662,84 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, } } +void av1_filter_block_plane_horz_rt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + assert(!plane); + for (int x = 0; x < x_range; x += 2) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always 
filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + aom_lpf_horizontal_6_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, @@ -661,7 +829,7 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, #if CONFIG_LPF_MASK int is_decoding, #endif - int plane_start, int plane_end) { + int plane_start, 
int plane_end, int is_realtime) { struct macroblockd_plane *pd = xd->plane; const int col_start = 0; const int col_end = cm->mi_params.mi_cols; @@ -679,7 +847,7 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, else if (plane == 2 && !(cm->lf.filter_level_v)) continue; - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, 0, 0, plane, plane + 1); av1_build_bitmask_vert_info(cm, &pd[plane], plane); @@ -716,49 +884,106 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, continue; else if (plane == 2 && !(cm->lf.filter_level_v)) continue; - if (cm->lf.combine_vert_horz_lf) { // filter all vertical and horizontal edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { // filter vertical edges - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, mi_col); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col); + + } else { + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } +#endif // filter horizontal edges if (mi_col - MAX_MIB_SIZE >= 0) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col - MAX_MIB_SIZE, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, mi_col - MAX_MIB_SIZE); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col - 
MAX_MIB_SIZE); + } else { + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } +#endif } } // filter horizontal edges - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col - MAX_MIB_SIZE, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, mi_col - MAX_MIB_SIZE); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + + } else { + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } +#endif } } else { // filter all vertical edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, mi_col); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } else { + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } +#endif } } // filter all horizontal edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, mi_col); 
+#else + if (is_realtime && !plane) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col); + + } else { + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } +#endif } } } @@ -770,7 +995,8 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, #if CONFIG_LPF_MASK int is_decoding, #endif - int plane_start, int plane_end, int partial_frame) { + int plane_start, int plane_end, int partial_frame, + int is_realtime) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; @@ -786,5 +1012,5 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, #if CONFIG_LPF_MASK is_decoding, #endif - plane_start, plane_end); + plane_start, plane_end, is_realtime); } diff --git a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h index ca16bbe614..ed4453b2a7 100644 --- a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h +++ b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h @@ -151,7 +151,7 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, #else void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, struct macroblockd *xd, int plane_start, - int plane_end, int partial_frame); + int plane_end, int partial_frame, int is_realtime); #endif void av1_filter_block_plane_vert(const struct AV1Common *const cm, @@ -164,6 +164,20 @@ void av1_filter_block_plane_horz(const struct AV1Common *const cm, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col); +void av1_filter_block_plane_vert_rt(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col); + +void av1_filter_block_plane_horz_rt(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const 
plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col); + uint8_t av1_get_filter_level(const struct AV1Common *cm, const loop_filter_info_n *lfi_n, const int dir_idx, int plane, const MB_MODE_INFO *mbmi); diff --git a/third_party/libaom/source/libaom/av1/common/blockd.h b/third_party/libaom/source/libaom/av1/common/blockd.h index 1d1c381bca..5e535add2d 100644 --- a/third_party/libaom/source/libaom/av1/common/blockd.h +++ b/third_party/libaom/source/libaom/av1/common/blockd.h @@ -194,11 +194,6 @@ typedef struct RD_STATS { int zero_rate; #if CONFIG_RD_DEBUG int txb_coeff_cost[MAX_MB_PLANE]; - // TODO(jingning): Temporary solution to silence stack over-size warning - // in handle_inter_mode. This should be fixed after rate-distortion - // optimization refactoring. - int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] - [TXB_COEFF_COST_MAP_SIZE]; #endif // CONFIG_RD_DEBUG } RD_STATS; @@ -325,6 +320,9 @@ typedef struct MB_MODE_INFO { int8_t cdef_strength : 4; /**@}*/ + /*! \brief Skip CDEF for this superblock */ + uint8_t skip_cdef_curr_sb; + #if CONFIG_RD_DEBUG /*! \brief RD info used for debugging */ RD_STATS rd_stats; @@ -552,10 +550,6 @@ typedef struct cfl_ctx { // Whether the reconstructed luma pixels need to be stored int store_y; - -#if CONFIG_DEBUG - int rate; -#endif // CONFIG_DEBUG } CFL_CTX; typedef struct dist_wtd_comp_params { @@ -810,7 +804,7 @@ typedef struct macroblockd { FRAME_CONTEXT *tile_ctx; /*! - * Bit depth: copied from cm->seq_params.bit_depth for convenience. + * Bit depth: copied from cm->seq_params->bit_depth for convenience. */ int bd; @@ -893,7 +887,7 @@ typedef struct macroblockd { /*! * Mask for this block used for compound prediction. */ - DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *seg_mask; /*! * CFL (chroma from luma) related parameters. 
@@ -937,13 +931,42 @@ typedef struct macroblockd { /*!\cond */ static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { +#if CONFIG_AV1_HIGHBITDEPTH return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; +#else + (void)xd; + return 0; +#endif } static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) { +#if CONFIG_AV1_HIGHBITDEPTH return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(buf16) : buf16; +#else + (void)xd; + return buf16; +#endif +} + +typedef struct BitDepthInfo { + int bit_depth; + /*! Is the image buffer high bit depth? + * Low bit depth buffer uses uint8_t. + * High bit depth buffer uses uint16_t. + * Equivalent to cm->seq_params->use_highbitdepth + */ + int use_highbitdepth_buf; +} BitDepthInfo; + +static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { + BitDepthInfo bit_depth_info; + bit_depth_info.bit_depth = xd->bd; + bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd); + assert(IMPLIES(!bit_depth_info.use_highbitdepth_buf, + bit_depth_info.bit_depth == 8)); + return bit_depth_info; } static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { diff --git a/third_party/libaom/source/libaom/av1/common/cdef.c b/third_party/libaom/source/libaom/av1/common/cdef.c index d9b5a104e4..9ab7d4d235 100644 --- a/third_party/libaom/source/libaom/av1/common/cdef.c +++ b/third_party/libaom/source/libaom/av1/common/cdef.c @@ -21,35 +21,6 @@ #include "av1/common/cdef_block.h" #include "av1/common/reconinter.h" -enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY); - -/*!\brief Parameters related to CDEF Block */ -typedef struct { - uint16_t *src; - uint8_t *dst; - uint16_t *colbuf[MAX_MB_PLANE]; - cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; - - int xdec; - int ydec; - int mi_wide_l2; - int mi_high_l2; - int frame_boundary[BOUNDARIES]; - - int damping; - int coeff_shift; - int level; - int sec_strength; - int cdef_count; - int is_zero_level; - int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]; - int 
var[CDEF_NBLOCKS][CDEF_NBLOCKS]; - - int dst_stride; - int coffset; - int roffset; -} CdefBlockInfo; - static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, int mi_stride) { MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; @@ -116,10 +87,10 @@ void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, } } -static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride, - const uint8_t *src, int src_voffset, int src_hoffset, - int sstride, int vsize, int hsize) { - if (cm->seq_params.use_highbitdepth) { +void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, + int dstride, const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, int hsize) { + if (cm->seq_params->use_highbitdepth) { const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); @@ -151,29 +122,35 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, // Inputs: // cm: Pointer to common structure. // fb_info: Pointer to the CDEF block-level parameter structure. -// linebuf: Top feedback buffer for CDEF. +// colbuf: Left column buffer for CDEF. // cdef_left: Left block is filtered or not. // fbc, fbr: col and row index of a block. // plane: plane index Y/CB/CR. -// prev_row_cdef: Top blocks are filtered or not. // Returns: // Nothing will be returned. 
-static void cdef_prepare_fb(AV1_COMMON *cm, CdefBlockInfo *fb_info, - uint16_t **linebuf, const int *cdef_left, int fbc, - int fbr, uint8_t plane, - unsigned char *prev_row_cdef) { +static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info, + uint16_t **const colbuf, const int *cdef_left, + int fbc, int fbr, int plane) { const CommonModeInfoParams *const mi_params = &cm->mi_params; uint16_t *src = fb_info->src; - const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; + const int luma_stride = + ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4); const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; int cstart = 0; if (!*cdef_left) cstart = -CDEF_HBORDER; int rend, cend; - int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); - int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); - int hsize = nhb << fb_info->mi_wide_l2; - int vsize = nvb << fb_info->mi_high_l2; + const int nhb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + const int nvb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + const int hsize = nhb << fb_info->mi_wide_l2; + const int vsize = nvb << fb_info->mi_high_l2; + const uint16_t *top_linebuf = fb_info->top_linebuf[plane]; + const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane]; + const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE; + const int stride = + luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x); if (fbc == nhfb - 1) cend = hsize; @@ -185,54 +162,55 @@ static void cdef_prepare_fb(AV1_COMMON *cm, CdefBlockInfo *fb_info, else rend = vsize + CDEF_VBORDER; - if (fbc == nhfb - 1) { - /* On the last superblock column, fill in the right border with - CDEF_VERY_LARGE to avoid filtering with the outside. 
*/ - fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE, rend + CDEF_VBORDER, - hsize + CDEF_HBORDER - cend, CDEF_VERY_LARGE); - } - if (fbr == nvfb - 1) { - /* On the last superblock row, fill in the bottom border with - CDEF_VERY_LARGE to avoid filtering with the outside. */ - fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, - CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); - } /* Copy in the pixels we need from the current superblock for deringing.*/ - copy_sb8_16(cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], - CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, - fb_info->coffset + cstart, fb_info->dst_stride, rend, - cend - cstart); - if (!prev_row_cdef[fbc]) { - copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE, fb_info->dst, - fb_info->roffset - CDEF_VBORDER, fb_info->coffset, - fb_info->dst_stride, CDEF_VBORDER, hsize); - } else if (fbr > 0) { - copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, - &linebuf[plane][fb_info->coffset], stride, CDEF_VBORDER, hsize); + av1_cdef_copy_sb8_16( + cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], + CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart, + fb_info->dst_stride, vsize, cend - cstart); + + /* Copy in the pixels we need for the current superblock from bottom buffer.*/ + if (fbr < nvfb - 1) { + copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + hsize, CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc > 0) { + copy_rect(&src[bot_offset], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride, + CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc < nhfb - 1) { + copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset + hsize], 
stride, CDEF_VBORDER, + CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + + /* Copy in the pixels we need from the current superblock from top buffer.*/ + if (fbr > 0) { + copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset], + stride, CDEF_VBORDER, hsize); } else { fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, CDEF_VERY_LARGE); } - if (!prev_row_cdef[fbc - 1]) { - copy_sb8_16(cm, src, CDEF_BSTRIDE, fb_info->dst, - fb_info->roffset - CDEF_VBORDER, - fb_info->coffset - CDEF_HBORDER, fb_info->dst_stride, - CDEF_VBORDER, CDEF_HBORDER); - } else if (fbr > 0 && fbc > 0) { - copy_rect(src, CDEF_BSTRIDE, - &linebuf[plane][fb_info->coffset - CDEF_HBORDER], stride, - CDEF_VBORDER, CDEF_HBORDER); + if (fbr > 0 && fbc > 0) { + copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER], + stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } - if (!prev_row_cdef[fbc + 1]) { - copy_sb8_16(cm, &src[CDEF_HBORDER + hsize], CDEF_BSTRIDE, fb_info->dst, - fb_info->roffset - CDEF_VBORDER, fb_info->coffset + hsize, - fb_info->dst_stride, CDEF_VBORDER, CDEF_HBORDER); - } else if (fbr > 0 && fbc < nhfb - 1) { + if (fbr > 0 && fbc < nhfb - 1) { copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, - &linebuf[plane][fb_info->coffset + hsize], stride, CDEF_VBORDER, + &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, @@ -241,36 +219,25 @@ static void cdef_prepare_fb(AV1_COMMON *cm, CdefBlockInfo *fb_info, if (*cdef_left) { /* If we deringed the superblock on the left then we need to copy in saved pixels. 
*/ - copy_rect(src, CDEF_BSTRIDE, fb_info->colbuf[plane], CDEF_HBORDER, + copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER, rend + CDEF_VBORDER, CDEF_HBORDER); } /* Saving pixels in case we need to dering the superblock on the right. */ - copy_rect(fb_info->colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, + copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, rend + CDEF_VBORDER, CDEF_HBORDER); - copy_sb8_16(cm, &linebuf[plane][fb_info->coffset], stride, fb_info->dst, - (MI_SIZE_64X64 << fb_info->mi_high_l2) * (fbr + 1) - CDEF_VBORDER, - fb_info->coffset, fb_info->dst_stride, CDEF_VBORDER, hsize); - if (fb_info->frame_boundary[TOP]) { - fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, - CDEF_VERY_LARGE); - } if (fb_info->frame_boundary[LEFT]) { fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } - if (fb_info->frame_boundary[BOTTOM]) { - fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, - CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); - } if (fb_info->frame_boundary[RIGHT]) { fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } } -static INLINE void cdef_filter_fb(CdefBlockInfo *fb_info, uint8_t plane, +static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, uint8_t use_highbitdepth) { int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset; if (use_highbitdepth) { @@ -291,11 +258,11 @@ static INLINE void cdef_filter_fb(CdefBlockInfo *fb_info, uint8_t plane, } // Initializes block-level parameters for CDEF. 
-static INLINE void cdef_init_fb_col(MACROBLOCKD *xd, +static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd, const CdefInfo *const cdef_info, - CdefBlockInfo *fb_info, - const int mbmi_cdef_strength, int fbc, - int fbr, uint8_t plane) { + CdefBlockInfo *const fb_info, + int mbmi_cdef_strength, int fbc, int fbr, + int plane) { if (plane == AOM_PLANE_Y) { fb_info->level = cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; @@ -328,9 +295,9 @@ static INLINE void cdef_init_fb_col(MACROBLOCKD *xd, fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2; } -static bool cdef_fb_col(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, - int fbc, int fbr, int *cdef_left, uint16_t **linebuf, - unsigned char *prev_row_cdef) { +static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const colbuf, + int *cdef_left, int fbc, int fbr) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mbmi_cdef_strength = mi_params @@ -343,9 +310,9 @@ static bool cdef_fb_col(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, MI_SIZE_64X64 * fbc] == NULL || mbmi_cdef_strength == -1) { *cdef_left = 0; - return 0; + return; } - for (uint8_t plane = 0; plane < num_planes; plane++) { + for (int plane = 0; plane < num_planes; plane++) { cdef_init_fb_col(xd, &cm->cdef_info, fb_info, mbmi_cdef_strength, fbc, fbr, plane); if (fb_info->is_zero_level || @@ -353,20 +320,26 @@ static bool cdef_fb_col(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, fb_info->dlist, BLOCK_64X64)) == 0) { *cdef_left = 0; - return 0; + return; } - cdef_prepare_fb(cm, fb_info, linebuf, cdef_left, fbc, fbr, plane, - prev_row_cdef); - cdef_filter_fb(fb_info, plane, cm->seq_params.use_highbitdepth); + cdef_prepare_fb(cm, fb_info, colbuf, cdef_left, fbc, fbr, plane); + cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth); } 
*cdef_left = 1; - return 1; } -static INLINE void cdef_init_fb_row(CdefBlockInfo *fb_info, int mi_rows, - int fbr) { - const int nvfb = (mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - +// Initializes row-level parameters for CDEF frame. +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr) { + (void)cdef_sync; + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + const bool ping_pong = fbr & 1; // for the current filter block, it's top left corner mi structure (mi_tl) // is first accessed to check whether the top and left boundaries are // frame boundaries. Then bottom-left and top-right mi structures are @@ -379,78 +352,58 @@ static INLINE void cdef_init_fb_row(CdefBlockInfo *fb_info, int mi_rows, fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; if (fbr != nvfb - 1) fb_info->frame_boundary[BOTTOM] = - (MI_SIZE_64X64 * (fbr + 1) == mi_rows) ? 1 : 0; + (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0; else fb_info->frame_boundary[BOTTOM] = 1; + + fb_info->src = src; + fb_info->damping = cm->cdef_info.cdef_damping; + fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + av1_zero(fb_info->dir); + av1_zero(fb_info->var); + + for (int plane = 0; plane < num_planes; plane++) { + const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + const int stride = luma_stride >> xd->plane[plane].subsampling_x; + // here ping-pong buffers are maintained for top linebuf + // to avoid linebuf over-write by consecutive row. 
+ uint16_t *const top_linebuf = + &linebuf[plane][ping_pong * CDEF_VBORDER * stride]; + fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride]; + + if (fbr != nvfb - 1) // top line buffer copy + av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf, + offset - CDEF_VBORDER, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + fb_info->top_linebuf[plane] = + &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride]; + + if (fbr != nvfb - 1) // bottom line buffer copy + av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride, + xd->plane[plane].dst.buf, offset, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + } } -static void cdef_fb_row(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, - uint16_t **linebuf, int fbr, - unsigned char *curr_row_cdef, - unsigned char *prev_row_cdef) { +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync) { + CdefBlockInfo fb_info; int cdef_left = 1; const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - cdef_init_fb_row(fb_info, cm->mi_params.mi_rows, fbr); + cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr); for (int fbc = 0; fbc < nhfb; fbc++) { - fb_info->frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0; + fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0; if (fbc != nhfb - 1) - fb_info->frame_boundary[RIGHT] = + fb_info.frame_boundary[RIGHT] = (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0; else - fb_info->frame_boundary[RIGHT] = 1; - curr_row_cdef[fbc] = cdef_fb_col(cm, xd, fb_info, fbc, fbr, &cdef_left, - linebuf, prev_row_cdef); - } -} - -// Initialize the frame-level CDEF parameters. -// Inputs: -// frame: Pointer to input frame buffer. -// cm: Pointer to common structure. 
-// xd: Pointer to common current coding block structure. -// fb_info: Pointer to the CDEF block-level parameter structure. -// src: Intermediate input buffer for CDEF. -// colbuf: Left feedback buffer for CDEF. -// linebuf: Top feedback buffer for CDEF. -// Returns: -// Nothing will be returned. -static void cdef_prepare_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd, CdefBlockInfo *fb_info, - uint16_t *src, uint16_t **colbuf, - uint16_t **linebuf) { - const int num_planes = av1_num_planes(cm); - const int stride = (cm->mi_params.mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; - av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, - num_planes); - - for (uint8_t plane = 0; plane < num_planes; plane++) { - linebuf[plane] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride); - const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; - const int block_height = (MI_SIZE_64X64 << mi_high_l2) + 2 * CDEF_VBORDER; - colbuf[plane] = aom_malloc( - sizeof(*colbuf) * - ((CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) + - 2 * CDEF_VBORDER) * - CDEF_HBORDER); - fill_rect(colbuf[plane], CDEF_HBORDER, block_height, CDEF_HBORDER, - CDEF_VERY_LARGE); - fb_info->colbuf[plane] = colbuf[plane]; - } - - fb_info->src = src; - fb_info->damping = cm->cdef_info.cdef_damping; - fb_info->coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); - memset(fb_info->dir, 0, sizeof(fb_info->dir)); - memset(fb_info->var, 0, sizeof(fb_info->var)); -} - -static void cdef_free(unsigned char *row_cdef, uint16_t **colbuf, - uint16_t **linebuf, const int num_planes) { - aom_free(row_cdef); - for (uint8_t plane = 0; plane < num_planes; plane++) { - aom_free(colbuf[plane]); - aom_free(linebuf[plane]); + fb_info.frame_boundary[RIGHT] = 1; + cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left, fbc, fbr); } } @@ -461,29 +414,15 @@ static void cdef_free(unsigned char *row_cdef, uint16_t **colbuf, // xd: Pointer to common current coding block 
structure. // Returns: // Nothing will be returned. -void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd) { - DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]); - uint16_t *colbuf[MAX_MB_PLANE] = { NULL }; - uint16_t *linebuf[MAX_MB_PLANE] = { NULL }; - CdefBlockInfo fb_info; - unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef; +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) { const int num_planes = av1_num_planes(cm); const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2); - memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2); - prev_row_cdef = row_cdef + 1; - curr_row_cdef = prev_row_cdef + nhfb + 2; - cdef_prepare_frame(frame, cm, xd, &fb_info, src, colbuf, linebuf); - - for (int fbr = 0; fbr < nvfb; fbr++) { - unsigned char *tmp; - cdef_fb_row(cm, xd, &fb_info, linebuf, fbr, curr_row_cdef, prev_row_cdef); - tmp = prev_row_cdef; - prev_row_cdef = curr_row_cdef; - curr_row_cdef = tmp; - } - cdef_free(row_cdef, colbuf, linebuf, num_planes); + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + + for (int fbr = 0; fbr < nvfb; fbr++) + av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf, + cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL); } diff --git a/third_party/libaom/source/libaom/av1/common/cdef.h b/third_party/libaom/source/libaom/av1/common/cdef.h index 4d6e60023b..194117884e 100644 --- a/third_party/libaom/source/libaom/av1/common/cdef.h +++ b/third_party/libaom/source/libaom/av1/common/cdef.h @@ -23,6 +23,40 @@ #include "av1/common/av1_common_int.h" #include "av1/common/cdef_block.h" +enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY); + +struct AV1CdefSyncData; + +/*!\brief Parameters related to CDEF 
Block */ +typedef struct { + uint16_t *src; /*!< CDEF intermediate buffer */ + uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */ + uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */ + uint8_t *dst; /*!< CDEF destination buffer */ + cdef_list + dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */ + + int xdec; /*!< Sub-sampling X */ + int ydec; /*!< Sub-sampling X */ + int mi_wide_l2; /*!< Pixels per mi unit in width */ + int mi_high_l2; /*!< Pixels per mi unit in height */ + int frame_boundary[BOUNDARIES]; /*!< frame boundaries */ + + int damping; /*!< CDEF damping factor */ + int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */ + int level; /*!< CDEF filtering level */ + int sec_strength; /*!< CDEF secondary strength */ + int cdef_count; /*!< Number of CDEF sub-blocks in superblock */ + int is_zero_level; /*!< CDEF filtering level ON/OFF */ + int dir[CDEF_NBLOCKS] + [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/ + int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */ + + int dst_stride; /*!< CDEF destination buffer stride */ + int coffset; /*!< current superblock offset in a row */ + int roffset; /*!< current row offset */ +} CdefBlockInfo; + static INLINE int sign(int i) { return i < 0 ? -1 : 1; } static INLINE int constrain(int diff, int threshold, int damping) { @@ -41,19 +75,36 @@ int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col, cdef_list *dlist, BLOCK_SIZE bsize); +typedef void (*cdef_init_fb_row_t)( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); + /*!\brief Function for applying CDEF to a frame * * \ingroup in_loop_cdef * This function applies CDEF to a frame. 
* - * \param[in, out] frame Compressed frame buffer - * \param[in, out] cm Pointer to top level common structure - * \param[in] xd Pointer to common current coding block structure + * \param[in, out] frame Compressed frame buffer + * \param[in, out] cm Pointer to top level common structure + * \param[in] xd Pointer to common current coding block structure + * \param[in] cdef_init_fb_row_fn Function Pointer * * \return Nothing is returned. Instead, the filtered frame is output in * \c frame. */ -void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd); +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn); +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync); +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libaom/source/libaom/av1/common/cdef_block.h b/third_party/libaom/source/libaom/av1/common/cdef_block.h index 6b0ae0a9db..574df2d0de 100644 --- a/third_party/libaom/source/libaom/av1/common/cdef_block.h +++ b/third_party/libaom/source/libaom/av1/common/cdef_block.h @@ -19,8 +19,8 @@ #define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) #define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) -/* We need to buffer three vertical lines. */ -#define CDEF_VBORDER (3) +/* We need to buffer two vertical lines. */ +#define CDEF_VBORDER (2) /* We only need to buffer three horizontal pixels too, but let's align to 16 bytes (8 x 16 bits) to make vectorization easier. 
*/ #define CDEF_HBORDER (8) diff --git a/third_party/libaom/source/libaom/av1/common/cfl.h b/third_party/libaom/source/libaom/av1/common/cfl.h index 0062e9f7ba..0d53764f28 100644 --- a/third_party/libaom/source/libaom/av1/common/cfl.h +++ b/third_party/libaom/source/libaom/av1/common/cfl.h @@ -39,7 +39,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; - if (cm->seq_params.monochrome) return CFL_DISALLOWED; + if (cm->seq_params->monochrome) return CFL_DISALLOWED; if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, diff --git a/third_party/libaom/source/libaom/av1/common/common.h b/third_party/libaom/source/libaom/av1/common/common.h index bed6083db2..cc2da98a16 100644 --- a/third_party/libaom/source/libaom/av1/common/common.h +++ b/third_party/libaom/source/libaom/av1/common/common.h @@ -50,7 +50,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } #define CHECK_MEM_ERROR(cm, lval, expr) \ - AOM_CHECK_MEM_ERROR(&cm->error, lval, expr) + AOM_CHECK_MEM_ERROR(cm->error, lval, expr) #define AOM_FRAME_MARKER 0x2 diff --git a/third_party/libaom/source/libaom/av1/common/common_data.h b/third_party/libaom/source/libaom/av1/common/common_data.h index 402845cafe..38e14714c0 100644 --- a/third_party/libaom/source/libaom/av1/common/common_data.h +++ b/third_party/libaom/source/libaom/av1/common/common_data.h @@ -434,9 +434,12 @@ static const int intra_mode_context[INTRA_MODES] = { static const int quant_dist_weight[4][2] = { { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE } }; -static const int quant_dist_lookup_table[2][4][2] = { - { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } }, - { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } }, + +static const int quant_dist_lookup_table[4][2] = { + { 9, 7 }, + { 11, 5 }, + { 12, 4 }, + { 13, 3 }, }; #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/av1/common/enums.h 
b/third_party/libaom/source/libaom/av1/common/enums.h index 9c2976b08d..0e1e744daf 100644 --- a/third_party/libaom/source/libaom/av1/common/enums.h +++ b/third_party/libaom/source/libaom/av1/common/enums.h @@ -321,6 +321,7 @@ enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); #define CFL_ALPHABET_SIZE_LOG2 4 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) #define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1) +#define CFL_INDEX_ZERO CFL_ALPHABET_SIZE #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) @@ -451,6 +452,14 @@ enum { UV_MODE_INVALID, // For uv_mode in inter blocks } UENUM1BYTE(UV_PREDICTION_MODE); +// Number of top model rd to store for pruning y modes in intra mode decision +#define TOP_INTRA_MODEL_COUNT 4 +// Total number of luma intra prediction modes (include both directional and +// non-directional modes) +// 61 = PAETH_PRED - DC_PRED + 1 + 6 * 8 +// Because there are 8 directional modes, each has additional 6 delta angles. 
+#define LUMA_MODE_COUNT 61 + enum { SIMPLE_TRANSLATION, OBMC_CAUSAL, // 2-sided OBMC diff --git a/third_party/libaom/source/libaom/av1/common/loopfiltermask.c b/third_party/libaom/source/libaom/av1/common/loopfiltermask.c index 1ae0b112ce..22ab0adf2c 100644 --- a/third_party/libaom/source/libaom/av1/common/loopfiltermask.c +++ b/third_party/libaom/source/libaom/av1/common/loopfiltermask.c @@ -1002,11 +1002,11 @@ void av1_filter_block_plane_bitmask_vert( } #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_filter_selectively_vert_row2( ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + &cm->lf_info, lfl, lfl2, (int)cm->seq_params->bit_depth); else filter_selectively_vert_row2( ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, @@ -1075,10 +1075,11 @@ void av1_filter_block_plane_bitmask_horz( mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth); + if (cm->seq_params->use_highbitdepth) + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, + (int)cm->seq_params->bit_depth); else filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, mask_8x8, mask_4x4, &cm->lf_info, lfl); @@ -1109,10 +1110,10 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm, uint8_t *lfl2; // filter two rows at a time - for (r = 0; r < cm->seq_params.mib_size && + for (r = 0; r < cm->seq_params->mib_size && ((mi_row + r) << MI_SIZE_LOG2 < cm->height); r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && + for (c = 0; c < 
cm->seq_params->mib_size && ((mi_col + c) << MI_SIZE_LOG2 < cm->width); c += MI_SIZE_64X64) { dst->buf += ((c << MI_SIZE_LOG2) >> ssx); @@ -1159,11 +1160,11 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm, uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_filter_selectively_vert_row2( ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + &cm->lf_info, lfl, lfl2, (int)cm->seq_params->bit_depth); else filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, @@ -1194,10 +1195,10 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm, uint64_t mask_4x4 = 0; uint8_t *lfl; - for (r = 0; r < cm->seq_params.mib_size && + for (r = 0; r < cm->seq_params->mib_size && ((mi_row + r) << MI_SIZE_LOG2 < cm->height); r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && + for (c = 0; c < cm->seq_params->mib_size && ((mi_col + c) << MI_SIZE_LOG2 < cm->width); c += MI_SIZE_64X64) { if (mi_row + r == 0) continue; @@ -1235,11 +1236,11 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm, mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, mask_8x8, mask_4x4, &cm->lf_info, lfl, - (int)cm->seq_params.bit_depth); + (int)cm->seq_params->bit_depth); else filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, mask_8x8, mask_4x4, &cm->lf_info, lfl); @@ -1260,9 +1261,11 @@ void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size]; const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size]; const TX_SIZE 
tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const int is_square_transform_size = tx_size <= TX_64X64; int mask_id = 0; int offset = 0; @@ -1330,9 +1333,11 @@ void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col, const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size]; const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size]; const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const int is_square_transform_size = mbmi->tx_size <= TX_64X64; int mask_id = 0; int offset = 0; diff --git a/third_party/libaom/source/libaom/av1/common/mv.h b/third_party/libaom/source/libaom/av1/common/mv.h index be539e8201..3203bf7278 100644 --- a/third_party/libaom/source/libaom/av1/common/mv.h +++ b/third_party/libaom/source/libaom/av1/common/mv.h @@ -12,6 +12,8 @@ #ifndef AOM_AV1_COMMON_MV_H_ #define AOM_AV1_COMMON_MV_H_ +#include <stdlib.h> + #include "av1/common/common.h" #include "av1/common/common_data.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/libaom/source/libaom/av1/common/mvref_common.c b/third_party/libaom/source/libaom/av1/common/mvref_common.c index 04e050a691..3431e7d6ad 100644 --- 
a/third_party/libaom/source/libaom/av1/common/mvref_common.c +++ b/third_party/libaom/source/libaom/av1/common/mvref_common.c @@ -258,7 +258,7 @@ static AOM_INLINE void scan_blk_mbmi( static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, int bs) { - const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; const int mask_row = mi_row & (sb_mi_size - 1); const int mask_col = mi_col & (sb_mi_size - 1); @@ -347,7 +347,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, const int cur_frame_index = cm->cur_frame->order_hint; const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); const int frame0_index = buf_0->order_hint; - const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, + const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, frame0_index); int idx; const int allow_high_precision_mv = cm->features.allow_high_precision_mv; @@ -380,7 +380,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, // Process compound inter mode const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); const int frame1_index = buf_1->order_hint; - const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info, + const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, frame1_index); int_mv comp_refmv; get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, @@ -838,7 +838,9 @@ void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, void av1_setup_frame_buf_refs(AV1_COMMON *cm) { cm->cur_frame->order_hint = cm->current_frame.order_hint; cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; - +#if CONFIG_FRAME_PARALLEL_ENCODE + cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level; +#endif // CONFIG_FRAME_PARALLEL_ENCODE MV_REFERENCE_FRAME ref_frame; for 
(ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); @@ -854,10 +856,10 @@ void av1_setup_frame_sign_bias(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); - if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) { + if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) { const int ref_order_hint = buf->order_hint; cm->ref_frame_sign_bias[ref_frame] = - (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint, + (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint, (int)cm->current_frame.order_hint) <= 0) ? 0 : 1; @@ -930,10 +932,10 @@ static int motion_field_projection(AV1_COMMON *cm, &start_frame_buf->ref_order_hints[0]; const int cur_order_hint = cm->cur_frame->order_hint; int start_to_current_frame_offset = get_relative_dist( - &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint); + &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint); for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { - ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info, + ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info, start_frame_order_hint, ref_order_hints[rf - LAST_FRAME]); } @@ -981,7 +983,7 @@ static int motion_field_projection(AV1_COMMON *cm, } void av1_setup_motion_field(AV1_COMMON *cm) { - const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); if (!order_hint_info->enable_order_hint) return; @@ -1219,7 +1221,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, } void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { - const OrderHintInfo 
*const order_hint_info = &cm->seq_params.order_hint_info; + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; skip_mode_info->skip_mode_allowed = 0; @@ -1323,11 +1325,11 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, int lst_frame_sort_idx = -1; int gld_frame_sort_idx = -1; - assert(cm->seq_params.order_hint_info.enable_order_hint); - assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0); + assert(cm->seq_params->order_hint_info.enable_order_hint); + assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0); const int cur_order_hint = (int)cm->current_frame.order_hint; const int cur_frame_sort_idx = - 1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1; + 1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1; REF_FRAME_INFO ref_frame_info[REF_FRAMES]; int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; @@ -1349,7 +1351,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, ref_frame_info[i].sort_idx = (offset == -1) ? -1 : cur_frame_sort_idx + - get_relative_dist(&cm->seq_params.order_hint_info, + get_relative_dist(&cm->seq_params->order_hint_info, offset, cur_order_hint); assert(ref_frame_info[i].sort_idx >= -1); @@ -1360,11 +1362,11 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference // frames. 
if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests a look-ahead frame as LAST"); } if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests a look-ahead frame as GOLDEN"); } diff --git a/third_party/libaom/source/libaom/av1/common/pred_common.h b/third_party/libaom/source/libaom/av1/common/pred_common.h index 12bcce84f2..3db9dd69ef 100644 --- a/third_party/libaom/source/libaom/av1/common/pred_common.h +++ b/third_party/libaom/source/libaom/av1/common/pred_common.h @@ -107,9 +107,9 @@ static INLINE int get_comp_index_context(const AV1_COMMON *cm, if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info, + int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info, fwd_frame_index, cur_frame_index)); - int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info, + int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, bck_frame_index)); const MB_MODE_INFO *const above_mi = xd->above_mbmi; diff --git a/third_party/libaom/source/libaom/av1/common/reconinter.c b/third_party/libaom/source/libaom/av1/common/reconinter.c index ad155b26ae..70f4c6d5ee 100644 --- a/third_party/libaom/source/libaom/av1/common/reconinter.c +++ b/third_party/libaom/source/libaom/av1/common/reconinter.c @@ -713,8 +713,8 @@ void av1_build_one_inter_predictor( } void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, - const MB_MODE_INFO *mbmi, int order_idx, - int *fwd_offset, int *bck_offset, + const MB_MODE_INFO *mbmi, int *fwd_offset, + int *bck_offset, int *use_dist_wtd_comp_avg, int is_compound) { 
assert(fwd_offset != NULL && bck_offset != NULL); @@ -734,18 +734,18 @@ void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, fwd_frame_index, cur_frame_index)), 0, MAX_FRAME_DISTANCE); - int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, bck_frame_index)), 0, MAX_FRAME_DISTANCE); const int order = d0 <= d1; if (d0 == 0 || d1 == 0) { - *fwd_offset = quant_dist_lookup_table[order_idx][3][order]; - *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order]; + *fwd_offset = quant_dist_lookup_table[3][order]; + *bck_offset = quant_dist_lookup_table[3][1 - order]; return; } @@ -758,8 +758,8 @@ void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; } - *fwd_offset = quant_dist_lookup_table[order_idx][i][order]; - *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order]; + *fwd_offset = quant_dist_lookup_table[i][order]; + *bck_offset = quant_dist_lookup_table[i][1 - order]; } // True if the following hold: @@ -911,7 +911,7 @@ static void build_inter_predictors_8x8_and_bigger( ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); av1_dist_wtd_comp_weight_assign( - cm, mi, 0, &inter_pred_params.conv_params.fwd_offset, + cm, mi, &inter_pred_params.conv_params.fwd_offset, &inter_pred_params.conv_params.bck_offset, &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); @@ -1189,7 +1189,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, uint8_t **dst_buf2) { -#if CONFIG_AV1_HIGHBITDEPTH if 
(is_cur_buf_hbd(xd)) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); @@ -1203,16 +1202,13 @@ void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, dst_buf2[2] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); } else { -#endif // CONFIG_AV1_HIGHBITDEPTH dst_buf1[0] = xd->tmp_obmc_bufs[0]; dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; dst_buf2[0] = xd->tmp_obmc_bufs[1]; dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; -#if CONFIG_AV1_HIGHBITDEPTH } -#endif // CONFIG_AV1_HIGHBITDEPTH } void av1_setup_build_prediction_by_above_pred( @@ -1363,10 +1359,12 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); assert(xd->mi[0]->use_intrabc == 0); + const SequenceHeader *seq_params = cm->seq_params; - av1_predict_intra_block(cm, xd, pd->width, pd->height, - max_txsize_rect_lookup[plane_bsize], mode, 0, 0, - FILTER_INTRA_MODES, ctx->plane[plane], + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, max_txsize_rect_lookup[plane_bsize], mode, + 0, 0, FILTER_INTRA_MODES, ctx->plane[plane], ctx->stride[plane], dst, dst_stride, 0, 0, plane); } diff --git a/third_party/libaom/source/libaom/av1/common/reconinter.h b/third_party/libaom/source/libaom/av1/common/reconinter.h index c8696160b6..056dc67d07 100644 --- a/third_party/libaom/source/libaom/av1/common/reconinter.h +++ b/third_party/libaom/source/libaom/av1/common/reconinter.h @@ -368,8 +368,8 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, } void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, - const MB_MODE_INFO *mbmi, int order_idx, - int *fwd_offset, int *bck_offset, + const MB_MODE_INFO *mbmi, int 
*fwd_offset, + int *bck_offset, int *use_dist_wtd_comp_avg, int is_compound); diff --git a/third_party/libaom/source/libaom/av1/common/reconintra.c b/third_party/libaom/source/libaom/av1/common/reconintra.c index 0c01f92183..51b01786ea 100644 --- a/third_party/libaom/source/libaom/av1/common/reconintra.c +++ b/third_party/libaom/source/libaom/av1/common/reconintra.c @@ -193,7 +193,7 @@ static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, return ret; } -static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, +static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, int mi_col, int top_available, int right_available, PARTITION_TYPE partition, TX_SIZE txsz, int row_off, int col_off, int ss_x, int ss_y) { @@ -223,7 +223,7 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; const int bh_in_mi_log2 = mi_size_high_log2[bsize]; - const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_high[sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; @@ -378,7 +378,7 @@ static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, return ret; } -static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, +static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, int mi_col, int bottom_available, int left_available, PARTITION_TYPE partition, TX_SIZE txsz, int row_off, int col_off, int ss_x, int ss_y) { @@ -415,7 +415,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; const int bh_in_mi_log2 = mi_size_high_log2[bsize]; - const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_high[sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> 
bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; @@ -971,7 +971,7 @@ static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { } } -static int get_filt_type(const MACROBLOCKD *xd, int plane) { +static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) { int ab_sm, le_sm; if (plane == 0) { @@ -1144,11 +1144,11 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { } #if CONFIG_AV1_HIGHBITDEPTH static void build_intra_predictors_high( - const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, - int dst_stride, PREDICTION_MODE mode, int angle_delta, - FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, - int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, - int n_bottomleft_px, int plane) { + const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, + PREDICTION_MODE mode, int angle_delta, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type, + int bit_depth) { int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); @@ -1166,7 +1166,7 @@ static void build_intra_predictors_high( int p_angle = 0; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; - int base = 128 << (xd->bd - 8); + int base = 128 << (bit_depth - 8); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. 
width = 4 // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are @@ -1270,7 +1270,7 @@ static void build_intra_predictors_high( if (use_filter_intra) { highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, - filter_intra_mode, xd->bd); + filter_intra_mode, bit_depth); return; } @@ -1280,61 +1280,57 @@ static void build_intra_predictors_high( if (!disable_edge_filter) { const int need_right = p_angle < 90; const int need_bottom = p_angle > 180; - const int filt_type = get_filt_type(xd, plane); if (p_angle != 90 && p_angle != 180) { const int ab_le = need_above_left ? 1 : 0; if (need_above && need_left && (txwpx + txhpx >= 24)) { filter_intra_edge_corner_high(above_row, left_col); } if (need_above && n_top_px > 0) { - const int strength = - intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); av1_filter_intra_edge_high(above_row - ab_le, n_px, strength); } if (need_left && n_left_px > 0) { const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, filt_type); + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); av1_filter_intra_edge_high(left_col - ab_le, n_px, strength); } } - upsample_above = - av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? 
txhpx : 0); - av1_upsample_intra_edge_high(above_row, n_px, xd->bd); + av1_upsample_intra_edge_high(above_row, n_px, bit_depth); } - upsample_left = - av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? txwpx : 0); - av1_upsample_intra_edge_high(left_col, n_px, xd->bd); + av1_upsample_intra_edge_high(left_col, n_px, bit_depth); } } highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, - upsample_above, upsample_left, p_angle, xd->bd); + upsample_above, upsample_left, p_angle, bit_depth); return; } // predict if (mode == DC_PRED) { dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( - dst, dst_stride, above_row, left_col, xd->bd); + dst, dst_stride, above_row, left_col, bit_depth); } else { - pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd); + pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH -static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, - int ref_stride, uint8_t *dst, int dst_stride, - PREDICTION_MODE mode, int angle_delta, - FILTER_INTRA_MODE filter_intra_mode, - TX_SIZE tx_size, int disable_edge_filter, - int n_top_px, int n_topright_px, - int n_left_px, int n_bottomleft_px, - int plane) { +static void build_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, int angle_delta, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) { int i; const uint8_t *above_ref = ref - ref_stride; const uint8_t *left_ref = ref - 1; @@ -1462,33 +1458,32 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, if (!disable_edge_filter) { const int need_right 
= p_angle < 90; const int need_bottom = p_angle > 180; - const int filt_type = get_filt_type(xd, plane); if (p_angle != 90 && p_angle != 180) { const int ab_le = need_above_left ? 1 : 0; if (need_above && need_left && (txwpx + txhpx >= 24)) { filter_intra_edge_corner(above_row, left_col); } if (need_above && n_top_px > 0) { - const int strength = - intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); av1_filter_intra_edge(above_row - ab_le, n_px, strength); } if (need_left && n_left_px > 0) { const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, filt_type); + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); av1_filter_intra_edge(left_col - ab_le, n_px, strength); } } - upsample_above = - av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? txhpx : 0); av1_upsample_intra_edge(above_row, n_px); } - upsample_left = - av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? 
txwpx : 0); av1_upsample_intra_edge(left_col, n_px); @@ -1559,11 +1554,14 @@ static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, return bs; } -void av1_predict_intra_block( - const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, - TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, - FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) { +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; @@ -1626,32 +1624,32 @@ void av1_predict_intra_block( } const int have_top_right = - has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available, + has_top_right(sb_size, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size, row_off, col_off, ss_x, ss_y); - const int have_bottom_left = - has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left, - partition, tx_size, row_off, col_off, ss_x, ss_y); + const int have_bottom_left = has_bottom_left( + sb_size, bsize, mi_row, mi_col, bottom_available, have_left, partition, + tx_size, row_off, col_off, ss_x, ss_y); - const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; + const int disable_edge_filter = !enable_intra_edge_filter; + const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { build_intra_predictors_high( - xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, - filter_intra_mode, tx_size, disable_edge_filter, - have_top ? 
AOMMIN(txwpx, xr + txwpx) : 0, + ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, have_top_right ? AOMMIN(txwpx, xr) : 0, have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); + have_bottom_left ? AOMMIN(txhpx, yd) : 0, intra_edge_filter_type, + xd->bd); return; } #endif - build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, - angle_delta, filter_intra_mode, tx_size, - disable_edge_filter, - have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right ? AOMMIN(txwpx, xr) : 0, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); + build_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right ? AOMMIN(txwpx, xr) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left ? AOMMIN(txhpx, yd) : 0, intra_edge_filter_type); } void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, @@ -1669,6 +1667,7 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, ? 
mbmi->filter_intra_mode_info.filter_intra_mode : FILTER_INTRA_MODES; const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; + const SequenceHeader *seq_params = cm->seq_params; if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { #if CONFIG_DEBUG @@ -1687,10 +1686,11 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, CFL_CTX *const cfl = &xd->cfl; CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); if (cfl->dc_pred_is_cached[pred_plane] == 0) { - av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, - angle_delta, use_palette, filter_intra_mode, dst, - dst_stride, dst, dst_stride, blk_col, blk_row, - plane); + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, + use_palette, filter_intra_mode, dst, dst_stride, + dst, dst_stride, blk_col, blk_row, plane); if (cfl->use_dc_pred_cache) { cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); cfl->dc_pred_is_cached[pred_plane] = 1; @@ -1701,9 +1701,10 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, cfl_predict_block(xd, dst, dst_stride, tx_size, plane); return; } - av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, - angle_delta, use_palette, filter_intra_mode, dst, - dst_stride, dst, dst_stride, blk_col, blk_row, plane); + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode, + dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane); } void av1_init_intra_predictors(void) { diff --git a/third_party/libaom/source/libaom/av1/common/reconintra.h b/third_party/libaom/source/libaom/av1/common/reconintra.h index 907db5daf8..fa66ccd541 100644 --- a/third_party/libaom/source/libaom/av1/common/reconintra.h +++ b/third_party/libaom/source/libaom/av1/common/reconintra.h @@ -26,11 +26,14 @@ 
void av1_init_intra_predictors(void); void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, int blk_col, int blk_row, TX_SIZE tx_size); -void av1_predict_intra_block( - const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, - TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, - FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride, int col_off, int row_off, int plane); +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane); // Mapping of interintra to intra mode for use in the intra component static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { @@ -64,7 +67,7 @@ static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, BLOCK_SIZE bs) { - if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0; + if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0; return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32; } diff --git a/third_party/libaom/source/libaom/av1/common/resize.c b/third_party/libaom/source/libaom/av1/common/resize.c index 0cfb5a29b8..112a08a539 100644 --- a/third_party/libaom/source/libaom/av1/common/resize.c +++ b/third_party/libaom/source/libaom/av1/common/resize.c @@ -1263,7 +1263,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int plane, int rows) { const int is_uv = (plane > 0); - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; const int downscaled_plane_width = 
ROUND_POWER_OF_TWO(cm->width, ss_x); const int upscaled_plane_width = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); @@ -1305,11 +1305,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, const int pad_right = (j == cm->tiles.cols - 1); #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, pad_left, pad_right, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); else upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, @@ -1354,18 +1354,18 @@ YV12_BUFFER_CONFIG *av1_scale_if_required( if (scaling_required) { const int num_planes = av1_num_planes(cm); #if CONFIG_AV1_HIGHBITDEPTH - if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8) { + if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) { av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); } else { av1_resize_and_extend_frame_nonnormative( - unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes); + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes); } #else if (use_optimized_scaler) { av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); } else { av1_resize_and_extend_frame_nonnormative( - unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes); + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes); } #endif return scaled; @@ -1432,7 +1432,7 @@ static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src, void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int byte_alignment = cm->features.byte_alignment; 
YV12_BUFFER_CONFIG copy_buffer; @@ -1445,7 +1445,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { ©_buffer, aligned_width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); // Copy function assumes the frames are the same size. @@ -1468,7 +1468,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { if (release_fb_cb(cb_priv, fb)) { unlock_buffer_pool(pool); aom_internal_error( - &cm->error, AOM_CODEC_MEM_ERROR, + cm->error, AOM_CODEC_MEM_ERROR, "Failed to free current frame buffer before superres upscaling"); } // aom_realloc_frame_buffer() leaves config data for frame_to_show intact @@ -1479,7 +1479,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, 0)) { unlock_buffer_pool(pool); aom_internal_error( - &cm->error, AOM_CODEC_MEM_ERROR, + cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate current frame buffer for superres upscaling"); } unlock_buffer_pool(pool); @@ -1495,7 +1495,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment)) aom_internal_error( - &cm->error, AOM_CODEC_MEM_ERROR, + cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); // Restore config data back to frame_to_show diff --git a/third_party/libaom/source/libaom/av1/common/restoration.c b/third_party/libaom/source/libaom/av1/common/restoration.c index 41d0e22501..202953c889 100644 --- a/third_party/libaom/source/libaom/av1/common/restoration.c +++ b/third_party/libaom/source/libaom/av1/common/restoration.c @@ -42,8 +42,8 @@ const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] 
= { AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) { AV1PixelRect rect; - int ss_x = is_uv && cm->seq_params.subsampling_x; - int ss_y = is_uv && cm->seq_params.subsampling_y; + int ss_x = is_uv && cm->seq_params->subsampling_x; + int ss_y = is_uv && cm->seq_params->subsampling_y; rect.top = 0; rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y); @@ -1107,7 +1107,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, int num_planes) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int bit_depth = seq_params->bit_depth; const int highbd = seq_params->use_highbitdepth; lr_ctxt->dst = &cm->rst_frame; @@ -1118,7 +1118,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL, NULL, 0) < 0) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); lr_ctxt->on_rest_unit = filter_frame_on_unit; @@ -1299,7 +1299,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, int32_t *tmpbuf, RestorationLineBuffers *rlbs) { const int is_uv = plane > 0; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const RestorationInfo *rsi = &cm->rst_info[plane]; @@ -1315,7 +1315,7 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int *rrow1) { assert(rcol0 && rcol1 && rrow0 && rrow1); - if (bsize != cm->seq_params.sb_size) return 0; + if (bsize != cm->seq_params->sb_size) return 0; if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0; assert(!cm->features.all_lossless); @@ -1345,8 +1345,8 @@ int 
av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, const int vert_units = av1_lr_count_units_in_tile(size, tile_h); // The size of an MI-unit on this plane of the image - const int ss_x = is_uv && cm->seq_params.subsampling_x; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const int mi_size_x = MI_SIZE >> ss_x; const int mi_size_y = MI_SIZE >> ss_y; @@ -1427,7 +1427,7 @@ static void save_deblock_boundary_lines( int upscaled_width; int line_bytes; if (av1_superres_scaled(cm)) { - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; line_bytes = upscaled_width << use_highbd; if (use_highbd) @@ -1474,7 +1474,7 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, // At the point where this function is called, we've already applied // superres. So we don't need to extend the lines here, we can just // pull directly from the topmost row of the upscaled frame. - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; const int upscaled_width = av1_superres_scaled(cm) ? 
(cm->superres_upscaled_width + ss_x) >> ss_x : src_width; @@ -1494,7 +1494,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd, int plane, AV1_COMMON *cm, int after_cdef) { const int is_uv = plane > 0; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; @@ -1559,7 +1559,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int after_cdef) { const int num_planes = av1_num_planes(cm); - const int use_highbd = cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; for (int p = 0; p < num_planes; ++p) { save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef); } diff --git a/third_party/libaom/source/libaom/av1/common/thread_common.c b/third_party/libaom/source/libaom/av1/common/thread_common.c index 638dc4c951..0c45749de1 100644 --- a/third_party/libaom/source/libaom/av1/common/thread_common.c +++ b/third_party/libaom/source/libaom/av1/common/thread_common.c @@ -152,6 +152,61 @@ static void loop_filter_data_reset(LFWorkerData *lf_data, } } +void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, + int num_workers) { + if (num_workers < 1) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } +#else + (void)cm; + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +void av1_free_cdef_sync(AV1CdefSync *cdef_sync) { + if (cdef_sync == NULL) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ != NULL) { + pthread_mutex_destroy(cdef_sync->mutex_); + aom_free(cdef_sync->mutex_); + } +#endif // 
CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, + int row) { + if (!row) return; +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_); + while (cdef_row_mt[row - 1].is_row_done != 1) + pthread_cond_wait(cdef_row_mt[row - 1].row_cond_, + cdef_row_mt[row - 1].row_mutex_); + cdef_row_mt[row - 1].is_row_done = 0; + pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_); +#else + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, + int row) { +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row].row_mutex_); + pthread_cond_signal(cdef_row_mt[row].row_cond_); + cdef_row_mt[row].is_row_done = 1; + pthread_mutex_unlock(cdef_row_mt[row].row_mutex_); +#else + (void)cdef_sync; + (void)row; +#endif // CONFIG_MULTITHREAD +} + static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, int plane) { #if CONFIG_MULTITHREAD @@ -211,7 +266,7 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, #if CONFIG_LPF_MASK int is_decoding, #endif - int plane_start, int plane_end) { + int plane_start, int plane_end, int is_realtime) { int mi_row, plane, dir; AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; lf_sync->jobs_enqueued = 0; @@ -238,6 +293,7 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, lf_job_queue->mi_row = mi_row; lf_job_queue->plane = plane; lf_job_queue->dir = dir; + lf_job_queue->is_realtime = is_realtime; lf_job_queue++; lf_sync->jobs_enqueued++; } @@ -272,7 +328,7 @@ static INLINE void thread_loop_filter_rows( const int sb_cols = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; - int mi_row, mi_col, plane, dir; + int mi_row, mi_col, plane, dir, is_realtime; int r, c; while (1) { @@ -283,17 +339,29 @@ static 
INLINE void thread_loop_filter_rows( plane = cur_job_info->plane; dir = cur_job_info->dir; r = mi_row >> MAX_MIB_SIZE_LOG2; + is_realtime = cur_job_info->is_realtime && !plane; if (dir == 0) { for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; - av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col, plane, plane + 1); - +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, mi_col); +#else + if (is_realtime) { + av1_filter_block_plane_vert_rt(cm, xd, plane, &planes[plane], + mi_row, mi_col); + + } else { + av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } +#endif sync_write(lf_sync, r, c, sb_cols, plane); } } else if (dir == 1) { @@ -309,10 +377,21 @@ static INLINE void thread_loop_filter_rows( // completed sync_read(lf_sync, r + 1, c, plane); - av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, mi_col); +#else + if (is_realtime) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &planes[plane], + mi_row, mi_col); + } else { + av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } +#endif } } } else { @@ -405,7 +484,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int is_decoding, #endif AVxWorker *workers, int nworkers, - AV1LfSync *lf_sync) { + AV1LfSync *lf_sync, int is_realtime) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); #if CONFIG_LPF_MASK int sb_rows; @@ -441,7 +520,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, #if CONFIG_LPF_MASK is_decoding, #endif - 
plane_start, plane_end); + plane_start, plane_end, is_realtime); // Set up loopfilter thread data. for (i = num_workers - 1; i >= 0; --i) { @@ -484,7 +563,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int is_decoding, #endif AVxWorker *workers, int num_workers, - AV1LfSync *lf_sync) { + AV1LfSync *lf_sync, int is_realtime) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; @@ -512,7 +591,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, // TODO(chengchen): can we remove this? struct macroblockd_plane *pd = xd->plane; - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame, 0, 0, plane, plane + 1); av1_build_bitmask_vert_info(cm, &pd[plane], plane); @@ -526,7 +605,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } #else loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, - plane_end, workers, num_workers, lf_sync); + plane_end, workers, num_workers, lf_sync, is_realtime); #endif } @@ -720,7 +799,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, for (int plane = 0; plane < num_planes; plane++) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; const int is_uv = plane > 0; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_y = is_uv && cm->seq_params->subsampling_y; AV1PixelRect tile_rect = ctxt[plane].tile_rect; const int unit_size = ctxt[plane].rsi->restoration_unit_size; @@ -932,3 +1011,198 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, cm); } #endif + +// Initializes cdef_sync parameters. 
+static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) { + cdef_sync->end_of_frame = 0; + cdef_sync->fbr = 0; + cdef_sync->fbc = 0; +} + +static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers, + AV1_COMMON *const cm, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = 0; + + // Wait for completion of Cdef frame. + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + had_error |= !winterface->sync(worker); + } + if (had_error) + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Failed to process cdef frame"); +} + +// Updates the row index of the next job to be processed. +// Also updates end_of_frame flag when the processing of all rows is complete. +static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync, + const int nvfb) { + cdef_sync->fbr++; + if (cdef_sync->fbr == nvfb) { + cdef_sync->end_of_frame = 1; + } +} + +// Checks if a job is available. If job is available, +// populates next job information and returns 1, else returns 0. +static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync, + int *cur_fbr, const int nvfb) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + int do_next_row = 0; + // Populates information needed for current job and update the row + // index of the next row to be processed. 
+ if (cdef_sync->end_of_frame == 0) { + do_next_row = 1; + *cur_fbr = cdef_sync->fbr; + update_cdef_row_next_job_info(cdef_sync, nvfb); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + return do_next_row; +} + +// Hook function for each thread in CDEF multi-threading. +static int cdef_sb_row_worker_hook(void *arg1, void *arg2) { + AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1; + AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2; + const int nvfb = + (cdef_worker->cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + int cur_fbr; + while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) { + av1_cdef_fb_row(cdef_worker->cm, cdef_worker->xd, cdef_worker->linebuf, + cdef_worker->colbuf, cdef_worker->srcbuf, cur_fbr, + cdef_worker->cdef_init_fb_row_fn, cdef_sync); + } + return 1; +} + +// Assigns CDEF hook function and thread data to each worker. +static void prepare_cdef_frame_workers( + AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker, + AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn) { + const int num_planes = av1_num_planes(cm); + + cdef_worker[0].srcbuf = cm->cdef_info.srcbuf; + for (int plane = 0; plane < num_planes; plane++) + cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane]; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + cdef_worker[i].cm = cm; + cdef_worker[i].xd = xd; + cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn; + for (int plane = 0; plane < num_planes; plane++) + cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane]; + + worker->hook = hook; + worker->data1 = cdef_sync; + worker->data2 = &cdef_worker[i]; + } +} + +// Initializes row-level parameters for CDEF frame. 
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr) { + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + + // for the current filter block, it's top left corner mi structure (mi_tl) + // is first accessed to check whether the top and left boundaries are + // frame boundaries. Then bottom-left and top-right mi structures are + // accessed to check whether the bottom and right boundaries + // (respectively) are frame boundaries. + // + // Note that we can't just check the bottom-right mi structure - eg. if + // we're at the right-hand edge of the frame but not the bottom, then + // the bottom-right mi is NULL but the bottom-left is not. + fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; + if (fbr != nvfb - 1) + fb_info->frame_boundary[BOTTOM] = + (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 
1 : 0; + else + fb_info->frame_boundary[BOTTOM] = 1; + + fb_info->src = src; + fb_info->damping = cm->cdef_info.cdef_damping; + fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + av1_zero(fb_info->dir); + av1_zero(fb_info->var); + + for (int plane = 0; plane < num_planes; plane++) { + const int stride = luma_stride >> xd->plane[plane].subsampling_x; + uint16_t *top_linebuf = &linebuf[plane][0]; + uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride]; + { + const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + + if (fbr != nvfb - 1) // if (fbr != 0) // top line buffer copy + av1_cdef_copy_sb8_16( + cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride, + xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + if (fbr != nvfb - 1) // bottom line buffer copy + av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride], + stride, xd->plane[plane].dst.buf, bot_offset, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + } + + fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride]; + fb_info->bot_linebuf[plane] = + &linebuf[plane] + [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)]; + } + + cdef_row_mt_sync_write(cdef_sync, fbr); + cdef_row_mt_sync_read(cdef_sync, fbr); +} + +// Implements multi-threading for CDEF. +// Perform CDEF on input frame. +// Inputs: +// frame: Pointer to input frame buffer. +// cm: Pointer to common structure. +// xd: Pointer to common current coding block structure. +// Returns: +// Nothing will be returned. 
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, + AV1CdefWorkerData *const cdef_worker, + AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, + cdef_init_fb_row_t cdef_init_fb_row_fn) { + YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf; + const int num_planes = av1_num_planes(cm); + + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + + reset_cdef_job_info(cdef_sync); + prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook, + workers, cdef_sync, num_workers, + cdef_init_fb_row_fn); + launch_cdef_workers(workers, num_workers); + sync_cdef_workers(workers, cm, num_workers); +} diff --git a/third_party/libaom/source/libaom/av1/common/thread_common.h b/third_party/libaom/source/libaom/av1/common/thread_common.h index 97b8abcff6..bcb4b879c1 100644 --- a/third_party/libaom/source/libaom/av1/common/thread_common.h +++ b/third_party/libaom/source/libaom/av1/common/thread_common.h @@ -15,6 +15,7 @@ #include "config/aom_config.h" #include "av1/common/av1_loopfilter.h" +#include "av1/common/cdef.h" #include "aom_util/aom_thread.h" #ifdef __cplusplus @@ -27,6 +28,7 @@ typedef struct AV1LfMTInfo { int mi_row; int plane; int dir; + int is_realtime; } AV1LfMTInfo; // Loopfilter row synchronization @@ -97,6 +99,55 @@ typedef struct AV1LrSyncData { int jobs_dequeued; } AV1LrSync; +typedef struct AV1CdefWorker { + AV1_COMMON *cm; + MACROBLOCKD *xd; + uint16_t *colbuf[MAX_MB_PLANE]; + uint16_t *srcbuf; + uint16_t *linebuf[MAX_MB_PLANE]; + cdef_init_fb_row_t cdef_init_fb_row_fn; +} AV1CdefWorkerData; + +typedef struct AV1CdefRowSync { +#if CONFIG_MULTITHREAD + pthread_mutex_t *row_mutex_; + pthread_cond_t *row_cond_; +#endif // CONFIG_MULTITHREAD + int is_row_done; +} AV1CdefRowSync; + +// Data related to CDEF search multi-thread synchronization. +typedef struct AV1CdefSyncData { +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. 
+ pthread_mutex_t *mutex_; +#endif // CONFIG_MULTITHREAD + // Data related to CDEF row mt sync information + AV1CdefRowSync *cdef_row_mt; + // Flag to indicate all blocks are processed and end of frame is reached + int end_of_frame; + // Row index in units of 64x64 block + int fbr; + // Column index in units of 64x64 block + int fbc; +} AV1CdefSync; + +void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, + AV1CdefWorkerData *const cdef_worker, + AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn); +void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); +void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, + int dstride, const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, int hsize); +void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, + int num_workers); +void av1_free_cdef_sync(AV1CdefSync *cdef_sync); + // Deallocate loopfilter synchronization related mutex and data. 
void av1_loop_filter_dealloc(AV1LfSync *lf_sync); @@ -107,7 +158,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, int is_decoding, #endif AVxWorker *workers, int num_workers, - AV1LfSync *lf_sync); + AV1LfSync *lf_sync, int is_realtime); #if !CONFIG_REALTIME_ONLY void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, diff --git a/third_party/libaom/source/libaom/av1/common/tile_common.c b/third_party/libaom/source/libaom/av1/common/tile_common.c index 1b11bd7606..8f5d2a6316 100644 --- a/third_party/libaom/source/libaom/av1/common/tile_common.c +++ b/third_party/libaom/source/libaom/av1/common/tile_common.c @@ -28,7 +28,7 @@ static int tile_log2(int blk_size, int target) { } void av1_get_tile_limits(AV1_COMMON *const cm) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; const int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); @@ -130,9 +130,9 @@ void av1_calculate_tile_rows(const SequenceHeader *const seq_params, void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { assert(row < cm->tiles.rows); int mi_row_start = cm->tiles.row_start_sb[row] - << cm->seq_params.mib_size_log2; + << cm->seq_params->mib_size_log2; int mi_row_end = cm->tiles.row_start_sb[row + 1] - << cm->seq_params.mib_size_log2; + << cm->seq_params->mib_size_log2; tile->tile_row = row; tile->mi_row_start = mi_row_start; tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); @@ -142,9 +142,9 @@ void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { assert(col < cm->tiles.cols); int mi_col_start = cm->tiles.col_start_sb[col] - << cm->seq_params.mib_size_log2; + << cm->seq_params->mib_size_log2; int mi_col_end = cm->tiles.col_start_sb[col + 1] - << cm->seq_params.mib_size_log2; + << 
cm->seq_params->mib_size_log2; tile->tile_col = col; tile->mi_col_start = mi_col_start; tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); @@ -153,16 +153,16 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) { int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( - tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); - int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + tile.mi_row_end - tile.mi_row_start, cm->seq_params->mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params->mib_size_log2; return sb_rows; } int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( - tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); - int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; + tile.mi_col_end - tile.mi_col_start, cm->seq_params->mib_size_log2); + int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params->mib_size_log2; return sb_cols; } @@ -195,8 +195,8 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, r.bottom = AOMMIN(r.bottom, frame_h); // Convert to coordinates in the appropriate plane - const int ss_x = is_uv && cm->seq_params.subsampling_x; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; r.left = ROUND_POWER_OF_TWO(r.left, ss_x); r.right = ROUND_POWER_OF_TWO(r.right, ss_x); @@ -215,7 +215,7 @@ void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { for (int i = 0; i < tiles->cols; ++i) { const int tile_width_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; - const int tile_w = tile_width_sb * cm->seq_params.mib_size; + const int tile_w = tile_width_sb * cm->seq_params->mib_size; assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension *w = tile_w; } 
@@ -223,7 +223,7 @@ void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { for (int i = 0; i < tiles->rows; ++i) { const int tile_height_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; - const int tile_h = tile_height_sb * cm->seq_params.mib_size; + const int tile_h = tile_height_sb * cm->seq_params->mib_size; assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension *h = tile_h; } diff --git a/third_party/libaom/source/libaom/av1/decoder/decodeframe.c b/third_party/libaom/source/libaom/av1/decoder/decodeframe.c index b364714e0a..9ca7d3cd35 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decodeframe.c +++ b/third_party/libaom/source/libaom/av1/decoder/decodeframe.c @@ -76,12 +76,11 @@ // Checks that the remaining bits start with a 1 and ends with 0s. // It consumes an additional byte, if already byte aligned before the check. int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { - AV1_COMMON *const cm = &pbi->common; // bit_offset is set to 0 (mod 8) when the reader is already byte aligned int bits_before_alignment = 8 - rb->bit_offset % 8; int trailing = aom_rb_read_literal(rb, bits_before_alignment); if (trailing != (1 << (bits_before_alignment - 1))) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } return 0; @@ -304,16 +303,18 @@ static AOM_INLINE void decode_reconstruct_tx( const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int sub_step = bsw * bsh; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += bsh) { + const int 
offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr, offsetc, block, sub_txs, eob_total); block += sub_step; @@ -362,7 +363,7 @@ static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi, PARTITION_TYPE partition, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col); @@ -914,6 +915,16 @@ static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi, if (plane && !xd->is_chroma_ref) break; const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE tx_size = av1_get_tx_size(plane, xd); +#if CONFIG_REALTIME_ONLY + // Realtime only build doesn't support 4x rectangular txfm sizes. + if (tx_size == TX_4X16 || tx_size == TX_16X4 || tx_size == TX_8X32 || + tx_size == TX_32X8 || tx_size == TX_16X64 || + tx_size == TX_64X16) { + aom_internal_error( + xd->error_info, AOM_CODEC_UNSUP_FEATURE, + "Realtime only build doesn't support rectangular txfm sizes"); + } +#endif const int stepr = tx_size_high_unit[tx_size]; const int stepc = tx_size_wide_unit[tx_size]; @@ -1219,9 +1230,9 @@ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi, : (j == 1 ? 
quant_params->u_ac_delta_q : quant_params->v_ac_delta_q); xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX( - current_qindex, dc_delta_q, cm->seq_params.bit_depth); + current_qindex, dc_delta_q, cm->seq_params->bit_depth); xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX( - current_qindex, ac_delta_q, cm->seq_params.bit_depth); + current_qindex, ac_delta_q, cm->seq_params->bit_depth); } } } @@ -1554,9 +1565,9 @@ static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm, } } if (!all_none) { - assert(cm->seq_params.sb_size == BLOCK_64X64 || - cm->seq_params.sb_size == BLOCK_128X128); - const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64; for (int p = 0; p < num_planes; ++p) cm->rst_info[p].restoration_unit_size = sb_size; @@ -1576,7 +1587,8 @@ static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm, } if (num_planes > 1) { - int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + int s = + AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); if (s && !chroma_none) { cm->rst_info[1].restoration_unit_size = cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); @@ -1847,7 +1859,7 @@ static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params, // Build y/uv dequant values based on segmentation. static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm, MACROBLOCKD *const xd) { - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; // When segmentation is disabled, only the first value is used. The // remaining are don't cares. const int max_segments = cm->seg.enabled ? 
MAX_SEGMENTS : 1; @@ -1909,7 +1921,7 @@ static AOM_INLINE void setup_superres(AV1_COMMON *const cm, cm->superres_upscaled_width = *width; cm->superres_upscaled_height = *height; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; if (!seq_params->enable_superres) return; if (aom_rb_read_bit(rb)) { @@ -1930,7 +1942,7 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, int height) { #if CONFIG_SIZE_LIMIT if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Dimensions of %dx%d beyond allowed size of %dx%d.", width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); #endif @@ -1950,7 +1962,7 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, // consistent and to force a realloc next time. cm->width = 0; cm->height = 0; - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } } else { @@ -1968,7 +1980,7 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { BufferPool *const pool = cm->buffer_pool; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; lock_buffer_pool(pool); if (aom_realloc_frame_buffer( @@ -1978,7 +1990,7 @@ static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0)) { unlock_buffer_pool(pool); - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); @@ -1999,7 +2011,7 @@ static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { static AOM_INLINE void setup_frame_size(AV1_COMMON *cm, int 
frame_size_override_flag, struct aom_read_bit_buffer *rb) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; int width, height; if (frame_size_override_flag) { @@ -2008,7 +2020,7 @@ static AOM_INLINE void setup_frame_size(AV1_COMMON *cm, av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); if (width > seq_params->max_frame_width || height > seq_params->max_frame_height) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Frame dimensions are larger than the maximum values"); } } else { @@ -2049,7 +2061,7 @@ static AOM_INLINE void setup_frame_size_with_refs( // the middle of a stream, and static analysis will error if we don't do // a null check here. if (ref_buf == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid condition: invalid reference buffer"); } else { const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf; @@ -2065,7 +2077,7 @@ static AOM_INLINE void setup_frame_size_with_refs( } } - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; if (!found) { int num_bits_width = seq_params->num_bits_width; int num_bits_height = seq_params->num_bits_height; @@ -2077,7 +2089,7 @@ static AOM_INLINE void setup_frame_size_with_refs( } if (width <= 0 || height <= 0) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid frame size"); // Check to make sure at least one of frames that this frame references @@ -2089,7 +2101,7 @@ static AOM_INLINE void setup_frame_size_with_refs( ref_frame->buf.y_crop_height, width, height); } if (!has_valid_ref_frame) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has invalid size"); for (int 
i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); @@ -2097,7 +2109,7 @@ static AOM_INLINE void setup_frame_size_with_refs( ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x, ref_frame->buf.subsampling_y, seq_params->bit_depth, seq_params->subsampling_x, seq_params->subsampling_y)) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has incompatible color format"); } setup_buffer_pool(cm); @@ -2117,7 +2129,7 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { static AOM_INLINE void read_tile_info_max_tile( AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; int width_mi = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); @@ -2213,7 +2225,7 @@ static AOM_INLINE void read_tile_info(AV1Decoder *const pbi, pbi->context_update_tile_id = aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols); if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Invalid context_update_tile_id"); } // tile size magnitude @@ -2366,7 +2378,7 @@ static const uint8_t *get_ls_tile_buffers( // Get the whole of the last column, otherwise stop at the required tile. for (int r = 0; r < (is_last ? 
tile_rows : tile_rows_end); ++r) { - get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } @@ -2378,7 +2390,7 @@ static const uint8_t *get_ls_tile_buffers( data = tile_col_data_end[c - 1]; for (int r = 0; r < tile_rows; ++r) { - get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } @@ -2446,11 +2458,11 @@ static AOM_INLINE void get_tile_buffers( if (tc < start_tile || tc > end_tile) continue; if (data + hdr_offset >= data_end) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Data ended before all tiles were read."); data += hdr_offset; - get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, - &pbi->common.error, &data, buf); + get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error, + &data, buf); } } } @@ -2460,7 +2472,7 @@ static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb, const int num_planes, int mi_row, int mi_col) { AV1_COMMON *const cm = &pbi->common; - int mib_size_log2 = cm->seq_params.mib_size_log2; + int mib_size_log2 = cm->seq_params->mib_size_log2; int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); CB_BUFFER *cb_buffer = cb_buffer_base + offset; @@ -2629,11 +2641,11 @@ static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, pbi->tile_data + tile_info.tile_row * cm->tiles.cols + tile_info.tile_col; const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); const int sb_row_in_tile = - (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2; + (mi_row - tile_info.mi_row_start) >> cm->seq_params->mib_size_log2; int sb_col_in_tile = 0; for (int mi_col = 
tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size, sb_col_in_tile++) { + mi_col += cm->seq_params->mib_size, sb_col_in_tile++) { set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col); @@ -2641,7 +2653,7 @@ static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, // Decoding of the super-block decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, - cm->seq_params.sb_size, 0x2); + cm->seq_params->sb_size, 0x2); sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile, sb_cols_in_tile); @@ -2711,16 +2723,16 @@ static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td, av1_reset_loop_restoration(xd, num_planes); for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { av1_zero_left_context(xd); for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size) { + mi_col += cm->seq_params->mib_size) { set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0); // Bit-stream parsing and decoding of the superblock decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, - cm->seq_params.sb_size, 0x3); + cm->seq_params->sb_size, 0x3); if (aom_reader_has_overflowed(td->bit_reader)) { aom_merge_corrupted_flag(&dcb->corrupted, 1); @@ -2801,6 +2813,10 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { decoder_alloc_tile_data(pbi, n_tiles); } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { aom_accounting_reset(&pbi->accounting); @@ -2837,7 +2853,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, av1_tile_init(&td->dcb.xd.tile, cm, row, 
col); td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex; setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size, - &cm->error, td->bit_reader, allow_update_cdf); + &pbi->error, td->bit_reader, allow_update_cdf); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { td->bit_reader->accounting = &pbi->accounting; @@ -2859,7 +2875,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, decode_tile(pbi, td, row, col); aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted); if (pbi->dcb.corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); } } @@ -3017,7 +3033,7 @@ static int get_next_job_info(AV1Decoder *const pbi, const int tile_cols_end = frame_row_mt_info->tile_cols_end; const int start_tile = frame_row_mt_info->start_tile; const int end_tile = frame_row_mt_info->end_tile; - const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; int num_mis_to_decode, num_threads_working; int num_mis_waiting_for_decode; int min_threads_working = INT_MAX; @@ -3135,7 +3151,7 @@ static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi, static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td, TileDataDec *const tile_data) { AV1_COMMON *const cm = &pbi->common; - const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; const int num_planes = av1_num_planes(cm); TileInfo tile_info = tile_data->tile_info; int tile_row = tile_info.tile_row; @@ -3148,16 +3164,16 @@ static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td, av1_reset_loop_restoration(xd, num_planes); for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { av1_zero_left_context(xd); for (int 
mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size) { + mi_col += cm->seq_params->mib_size) { set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col); // Bit-stream parsing of the superblock decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, - cm->seq_params.sb_size, 0x1); + cm->seq_params->sb_size, 0x1); if (aom_reader_has_overflowed(td->bit_reader)) { aom_merge_corrupted_flag(&dcb->corrupted, 1); @@ -3357,6 +3373,8 @@ void av1_free_mc_tmp_buf(ThreadData *thread_data) { aom_free(thread_data->tmp_conv_dst); thread_data->tmp_conv_dst = NULL; + aom_free(thread_data->seg_mask); + thread_data->seg_mask = NULL; for (int i = 0; i < 2; ++i) { aom_free(thread_data->tmp_obmc_bufs[i]); thread_data->tmp_obmc_bufs[i] = NULL; @@ -3389,6 +3407,10 @@ static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm, CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst, aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*thread_data->tmp_conv_dst))); + CHECK_MEM_ERROR(cm, thread_data->seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask))); + for (int i = 0; i < 2; ++i) { CHECK_MEM_ERROR( cm, thread_data->tmp_obmc_bufs[i], @@ -3411,6 +3433,8 @@ static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi, thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0]; thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1]; thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst; + if (worker_idx) + thread_data->td->dcb.xd.seg_mask = thread_data->td->seg_mask; for (int j = 0; j < 2; ++j) { thread_data->td->dcb.xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j]; @@ -3481,7 +3505,7 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) { winterface->init(worker); worker->thread_name = "aom tile worker"; if (worker_idx != 0 && !winterface->reset(worker)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "Tile 
decoder thread creation failed"); } @@ -3498,7 +3522,7 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) { thread_data->error_info.setjmp = 0; } } - const int use_highbd = cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; @@ -3590,6 +3614,10 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { decoder_alloc_tile_data(pbi, n_tiles); } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { @@ -3606,7 +3634,7 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, sync_dec_workers(pbi, num_workers); if (pbi->dcb.corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); if (tiles->large_scale) { @@ -3624,8 +3652,8 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; - int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * - ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1); if (pbi->cb_buffer_alloc_size < size) { av1_dec_free_cb_buf(pbi); @@ -3669,10 +3697,10 @@ static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start, tile_data->dec_row_mt_sync.num_threads_working = 0; tile_data->dec_row_mt_sync.mi_rows = 
ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start, - cm->seq_params.mib_size_log2); + cm->seq_params->mib_size_log2); tile_data->dec_row_mt_sync.mi_cols = ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start, - cm->seq_params.mib_size_log2); + cm->seq_params->mib_size_log2); frame_row_mt_info->mi_rows_to_decode += tile_data->dec_row_mt_sync.mi_rows; @@ -3776,6 +3804,10 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, } decoder_alloc_tile_data(pbi, n_tiles); } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { @@ -3811,7 +3843,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, sync_dec_workers(pbi, num_workers); if (pbi->dcb.corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); if (tiles->large_scale) { @@ -3829,7 +3861,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, static AOM_INLINE void error_handler(void *data) { AV1_COMMON *const cm = (AV1_COMMON *)data; - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); } // Reads the high_bitdepth and twelve_bit fields in color_config() and sets @@ -3860,7 +3892,7 @@ static AOM_INLINE void read_bitdepth( void av1_read_film_grain_params(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { aom_film_grain_t *pars = &cm->film_grain_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; pars->apply_grain = aom_rb_read_bit(rb); if (!pars->apply_grain) { @@ -3890,7 +3922,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm, } } if 
(!found) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Invalid film grain reference idx %d. ref_frame_idx = " "{%d, %d, %d, %d, %d, %d, %d}", film_grain_params_ref_idx, cm->remapped_ref_idx[0], @@ -3900,11 +3932,11 @@ void av1_read_film_grain_params(AV1_COMMON *cm, } RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx]; if (buf == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Invalid Film grain reference idx"); } if (!buf->film_grain_params_present) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Film grain reference parameters not available"); } uint16_t random_seed = pars->random_seed; @@ -3916,13 +3948,13 @@ void av1_read_film_grain_params(AV1_COMMON *cm, // Scaling functions parameters pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14 if (pars->num_y_points > 14) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film grain luma scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_y_points; i++) { pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0]) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); @@ -3941,14 +3973,14 @@ void av1_read_film_grain_params(AV1_COMMON *cm, } else { pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10 if (pars->num_cb_points > 10) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film 
grain cb scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_cb_points; i++) { pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0]) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8); @@ -3956,14 +3988,14 @@ void av1_read_film_grain_params(AV1_COMMON *cm, pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10 if (pars->num_cr_points > 10) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film grain cr scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_cr_points; i++) { pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0]) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); @@ -3972,7 +4004,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm, if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "In YCbCr 4:2:0, film grain shall be applied " "to both chroma components or neither."); } @@ -4024,13 +4056,13 @@ void av1_read_film_grain_params(AV1_COMMON *cm, static AOM_INLINE void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - if (cm->seq_params.film_grain_params_present && + 
if (cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { av1_read_film_grain_params(cm, rb); } else { memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } - cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, sizeof(aom_film_grain_t)); } @@ -4164,7 +4196,7 @@ void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, static AOM_INLINE void read_temporal_point_info( AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { cm->frame_presentation_time = aom_rb_read_unsigned_literal( - rb, cm->seq_params.decoder_model_info.frame_presentation_time_length); + rb, cm->seq_params->decoder_model_info.frame_presentation_time_length); } void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, @@ -4192,7 +4224,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, seq_params->frame_id_length = aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1; if (seq_params->frame_id_length > 16) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid frame_id_length"); } @@ -4446,7 +4478,7 @@ static INLINE void reset_frame_buffers(AV1_COMMON *cm) { static int read_uncompressed_header(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { AV1_COMMON *const cm = &pbi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CurrentFrame *const current_frame = &cm->current_frame; FeatureFlags *const features = &cm->features; MACROBLOCKD *const xd = &pbi->dcb.xd; @@ -4457,7 +4489,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, sframe_info->is_s_frame_at_altref = 0; if (!pbi->sequence_header_ready) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, 
AOM_CODEC_CORRUPT_FRAME, "No sequence header"); } @@ -4479,14 +4511,14 @@ static int read_uncompressed_header(AV1Decoder *pbi, if (cm->show_existing_frame) { if (pbi->sequence_header_changed) { aom_internal_error( - &cm->error, AOM_CODEC_CORRUPT_FRAME, + &pbi->error, AOM_CODEC_CORRUPT_FRAME, "New sequence header starts with a show_existing_frame."); } // Show an existing frame directly. const int existing_frame_idx = aom_rb_read_literal(rb, 3); RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx]; if (frame_to_show == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a decoded frame"); } if (seq_params->decoder_model_info_present_flag && @@ -4500,7 +4532,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, * referencing */ if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || pbi->valid_for_referencing[existing_frame_idx] == 0) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference buffer frame ID mismatch"); } lock_buffer_pool(pool); @@ -4526,7 +4558,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // show_existing_frame is used to show a previous frame, that the value // of showable_frame for the previous frame was equal to 1. 
if (!frame_to_show->showable_frame) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a showable frame"); } // Section 6.8.2: It is a requirement of bitstream conformance that when @@ -4554,7 +4586,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, pbi->decoding_first_frame = 1; reset_frame_buffers(cm); } else { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Sequence header has changed without a keyframe."); } } @@ -4569,7 +4601,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } if (seq_params->still_picture && (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Still pictures must be coded as shown keyframes"); } cm->showable_frame = current_frame->frame_type != KEY_FRAME; @@ -4641,7 +4673,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, /* Check current_frame_id for conformance */ if (prev_frame_id == cm->current_frame_id || diff_frame_id >= (1 << (frame_id_length - 1))) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Invalid value of current_frame_id"); } } @@ -4672,18 +4704,18 @@ static int read_uncompressed_header(AV1Decoder *pbi, } if (seq_params->decoder_model_info_present_flag) { - cm->buffer_removal_time_present = aom_rb_read_bit(rb); - if (cm->buffer_removal_time_present) { + pbi->buffer_removal_time_present = aom_rb_read_bit(rb); + if (pbi->buffer_removal_time_present) { for (int op_num = 0; op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (seq_params->op_params[op_num].decoder_model_param_present_flag) { - if ((((seq_params->operating_point_idc[op_num] >> + if (seq_params->operating_point_idc[op_num] == 0 || + 
(((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1) && ((seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & - 0x1)) || - seq_params->operating_point_idc[op_num] == 0) { + 0x1))) { cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal( rb, seq_params->decoder_model_info.buffer_removal_time_length); } else { @@ -4713,7 +4745,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, if (current_frame->frame_type == INTRA_ONLY_FRAME) { current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); if (current_frame->refresh_frame_flags == 0xFF) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Intra only frames cannot have refresh flags 0xFF"); } if (pbi->need_resync) { @@ -4747,7 +4779,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // pixels set to neutral grey. int buf_idx = get_free_fb(cm); if (buf_idx == INVALID_IDX) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); } buf = &frame_bufs[buf_idx]; @@ -4760,7 +4792,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0)) { decrease_ref_count(buf, pool); unlock_buffer_pool(pool); - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); @@ -4827,10 +4859,10 @@ static int read_uncompressed_header(AV1Decoder *pbi, // reference to a slot that hasn't been set yet. That's what we are // checking here. 
if (lst_buf == NULL) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); if (gld_buf == NULL) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref); @@ -4848,7 +4880,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // reference to a slot that hasn't been set yet. That's what we are // checking here. if (cm->ref_frame_map[ref] == NULL) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); cm->remapped_ref_idx[i] = ref; } else { @@ -4856,7 +4888,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } // Check valid for referencing if (pbi->valid_for_referencing[ref] == 0) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference frame not valid for referencing"); cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; @@ -4872,7 +4904,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // Compare values derived from delta_frame_id_minus_1 and // refresh_frame_flags. 
if (ref_frame_id != cm->ref_frame_id[ref]) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference buffer frame ID mismatch"); } } @@ -4895,7 +4927,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->prev_frame = get_primary_ref_frame_buf(cm); if (features->primary_ref_frame != PRIMARY_REF_NONE && get_primary_ref_frame_buf(cm) == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference frame containing this frame's initial " "frame context is unavailable."); } @@ -4915,7 +4947,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, ref_scale_factors, ref_buf->buf.y_crop_width, ref_buf->buf.y_crop_height, cm->width, cm->height); if ((!av1_is_valid_scale(ref_scale_factors))) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); } } @@ -4952,7 +4984,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->cur_frame->buf.render_height = cm->render_height; if (pbi->need_resync) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Keyframe / intra-only frame required to reset decoder" " state"); } @@ -4973,13 +5005,13 @@ static int read_uncompressed_header(AV1Decoder *pbi, read_tile_info(pbi, rb); if (!av1_is_min_tile_width_satisfied(cm)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Minimum tile width requirement not satisfied"); } CommonQuantParams *const quant_params = &cm->quant_params; setup_quantization(quant_params, av1_num_planes(cm), - cm->seq_params.separate_uv_delta_q, rb); + cm->seq_params->separate_uv_delta_q, rb); xd->bd = (int)seq_params->bit_depth; CommonContexts *const above_contexts = &cm->above_contexts; @@ -4990,7 +5022,7 @@ static int 
read_uncompressed_header(AV1Decoder *pbi, if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, cm->mi_params.mi_cols, av1_num_planes(cm))) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } } @@ -5070,7 +5102,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, features->reduced_tx_set_used = aom_rb_read_bit(rb); if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Frame wrongly requests reference frame MVs"); } @@ -5170,7 +5202,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, // Use the default frame context values. *cm->fc = *cm->default_frame_context; if (!cm->fc->initialized) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); } return uncomp_hdr_size; @@ -5180,8 +5212,8 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, av1_setup_motion_field(cm); - av1_setup_block_planes(xd, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { // use the default frame context values *cm->fc = *cm->default_frame_context; @@ -5189,7 +5221,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, *cm->fc = get_primary_ref_frame_buf(cm)->frame_context; } if (!cm->fc->initialized) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); pbi->dcb.corrupted = 0; @@ -5207,7 +5239,7 @@ static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) { av1_alloc_restoration_buffers(cm); } #endif - const int use_highbd = 
cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; if (pbi->td.mc_buf_size != buf_size) { av1_free_mc_tmp_buf(&pbi->td); @@ -5242,13 +5274,17 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, // If the bit stream is monochrome, set the U and V buffers to a constant. if (num_planes < 3) { - set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1); + set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1); } if (end_tile != tiles->rows * tiles->cols - 1) { return; } + av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync, + pbi->num_workers); + av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers); + if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) { if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { if (pbi->num_workers > 1) { @@ -5257,13 +5293,13 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, #if CONFIG_LPF_MASK 1, #endif - pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync); + pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync, 0); } else { av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->dcb.xd, #if CONFIG_LPF_MASK 1, #endif - 0, num_planes, 0); + 0, num_planes, 0, 0); } } @@ -5285,7 +5321,14 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, cm, 0); if (do_cdef) { - av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd); + if (pbi->num_workers > 1) { + av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker, + pbi->tile_workers, &pbi->cdef_sync, + pbi->num_workers, av1_cdef_init_fb_row_mt); + } else { + av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd, + av1_cdef_init_fb_row); + } } superres_post_decode(pbi); @@ -5323,7 +5366,14 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, #else if (!optimized_loop_restoration) { if (do_cdef) { - av1_cdef_frame(&pbi->common.cur_frame->buf, cm, 
&pbi->dcb.xd); + if (pbi->num_workers > 1) { + av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker, + pbi->tile_workers, &pbi->cdef_sync, + pbi->num_workers, av1_cdef_init_fb_row_mt); + } else { + av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd, + av1_cdef_init_fb_row); + } } } #endif // !CONFIG_REALTIME_ONLY @@ -5339,7 +5389,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, av1_reset_cdf_symbol_counters(cm->fc); } } else { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Decode failed. Frame data is corrupted."); } diff --git a/third_party/libaom/source/libaom/av1/decoder/decodemv.c b/third_party/libaom/source/libaom/av1/decoder/decodemv.c index 412be86989..839bda2be6 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decodemv.c +++ b/third_party/libaom/source/libaom/av1/decoder/decodemv.c @@ -46,7 +46,7 @@ static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) { // At the start of a superblock, mark that we haven't yet read CDEF strengths // for any of the CDEF units contained in this superblock. - const int sb_mask = (cm->seq_params.mib_size - 1); + const int sb_mask = (cm->seq_params->mib_size - 1); const int mi_row_in_sb = (xd->mi_row & sb_mask); const int mi_col_in_sb = (xd->mi_col & sb_mask); if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { @@ -61,7 +61,7 @@ static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) { const int index_mask = cdef_size; const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); - const int index = (cm->seq_params.sb_size == BLOCK_128X128) + const int index = (cm->seq_params->sb_size == BLOCK_128X128) ? 
cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb : 0; @@ -85,12 +85,12 @@ static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, aom_reader *r, MB_MODE_INFO *const mbmi) { int sign, abs, reduced_delta_qindex = 0; BLOCK_SIZE bsize = mbmi->bsize; - const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1); - const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1); + const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1); + const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1); const int read_delta_q_flag = (b_col == 0 && b_row == 0); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) && + if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && read_delta_q_flag) { abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR); const int smallval = (abs < DELTA_Q_SMALL); @@ -117,11 +117,11 @@ static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r, int mi_row) { int reduced_delta_lflevel = 0; const BLOCK_SIZE bsize = mbmi->bsize; - const int b_col = mi_col & (cm->seq_params.mib_size - 1); - const int b_row = mi_row & (cm->seq_params.mib_size - 1); + const int b_col = mi_col & (cm->seq_params->mib_size - 1); + const int b_row = mi_row & (cm->seq_params->mib_size - 1); const int read_delta_lf_flag = (b_col == 0 && b_row == 0); - if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) && + if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && read_delta_lf_flag) { int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR); const int smallval = (abs < DELTA_LF_SMALL); @@ -579,7 +579,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; - read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r); + read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r); } } if (num_planes > 1 && mbmi->uv_mode == 
UV_DC_PRED && xd->is_chroma_ref) { @@ -591,7 +591,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; - read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r); + read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r); } } } @@ -682,7 +682,7 @@ static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, mv->as_mv.row = (mv->as_mv.row >> 3) * 8; int valid = is_mv_valid(&mv->as_mv) && av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize, - cm->seq_params.mib_size_log2); + cm->seq_params->mib_size_log2); return valid; } @@ -711,7 +711,7 @@ static void read_intrabc_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb, av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0); int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; if (dv_ref.as_int == 0) - av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, xd->mi_row); + av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row); // Ref DV should not have sub-pel. int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0; dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8; @@ -816,7 +816,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm, ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) : 0; - if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { @@ -1076,7 +1076,7 @@ static void read_intra_block_mode_info(AV1_COMMON *const cm, use_angle_delta && av1_is_directional_mode(mbmi->mode) ? 
read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) : 0; - if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { @@ -1375,7 +1375,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag); mbmi->use_wedge_interintra = 0; - if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode && + if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode && is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; const int interintra = @@ -1423,7 +1423,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, if (has_second_ref(mbmi) && !mbmi->skip_mode) { // Read idx to indicate current compound inter prediction mode group const int masked_compound_used = is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int ctx_comp_group_idx = get_comp_group_idx_context(xd); @@ -1432,7 +1432,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, } if (mbmi->comp_group_idx == 0) { - if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); mbmi->compound_idx = (uint8_t)aom_read_symbol( r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); @@ -1473,7 +1473,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, } read_mb_interp_filter(xd, features->interp_filter, - cm->seq_params.enable_dual_filter, mbmi, r); + cm->seq_params->enable_dual_filter, mbmi, r); #if !CONFIG_REALTIME_ONLY if (mbmi->motion_mode == WARPED_CAUSAL) { @@ -1573,11 +1573,11 @@ void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, if 
(frame_is_intra_only(cm)) { read_intra_frame_mode_info(cm, dcb, r); - if (cm->seq_params.order_hint_info.enable_ref_frame_mvs) + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis); } else { read_inter_frame_mode_info(pbi, dcb, r); - if (cm->seq_params.order_hint_info.enable_ref_frame_mvs) + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis); } } diff --git a/third_party/libaom/source/libaom/av1/decoder/decoder.c b/third_party/libaom/source/libaom/av1/decoder/decoder.c index 1680734a09..40dd71cea2 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decoder.c +++ b/third_party/libaom/source/libaom/av1/decoder/decoder.c @@ -97,17 +97,19 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { av1_zero(*pbi); AV1_COMMON *volatile const cm = &pbi->common; + cm->seq_params = &pbi->seq_params; + cm->error = &pbi->error; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. 
- if (setjmp(cm->error.jmp)) { - cm->error.setjmp = 0; + if (setjmp(pbi->error.jmp)) { + pbi->error.setjmp = 0; av1_decoder_remove(pbi); return NULL; } - cm->error.setjmp = 1; + pbi->error.setjmp = 1; CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); @@ -129,7 +131,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { pbi->decoding_first_frame = 1; pbi->common.buffer_pool = pool; - cm->seq_params.bit_depth = AOM_BITS_8; + cm->seq_params->bit_depth = AOM_BITS_8; cm->mi_params.free_mi = dec_free_mi; cm->mi_params.setup_mi = dec_setup_mi; @@ -146,7 +148,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { aom_accounting_init(&pbi->accounting); #endif - cm->error.setjmp = 0; + pbi->error.setjmp = 0; aom_get_worker_interface()->init(&pbi->lf_worker); pbi->lf_worker.thread_name = "aom lf worker"; @@ -194,6 +196,7 @@ void av1_decoder_remove(AV1Decoder *pbi) { } aom_free(pbi->thread_data); } + aom_free(pbi->dcb.xd.seg_mask); for (i = 0; i < pbi->num_workers; ++i) { AVxWorker *const worker = &pbi->tile_workers[i]; @@ -261,16 +264,16 @@ aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx); if (cfg == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "No reference frame"); return AOM_CODEC_ERROR; } if (!equal_dimensions(cfg, sd)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(cfg, sd, num_planes); - return cm->error.error_code; + return pbi->error.error_code; } static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, @@ -293,13 +296,13 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, ref_buf = get_ref_frame(cm, idx); if (ref_buf == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + 
aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame"); return AOM_CODEC_ERROR; } if (!use_external_ref) { if (!equal_dimensions(ref_buf, sd)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. @@ -307,7 +310,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, } } else { if (!equal_dimensions_and_border(ref_buf, sd)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer pointers. @@ -323,7 +326,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, } } - return cm->error.error_code; + return cm->error->error_code; } aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, @@ -332,12 +335,12 @@ aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, const int num_planes = av1_num_planes(cm); if (!equal_dimensions_and_border(new_frame, sd)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(new_frame, sd, num_planes); - return cm->error.error_code; + return cm->error->error_code; } static void release_current_frame(AV1Decoder *pbi) { @@ -355,7 +358,7 @@ static void release_current_frame(AV1Decoder *pbi) { // Consumes a reference to cm->cur_frame. // // This functions returns void. It reports failure by setting -// cm->error.error_code. +// pbi->error.error_code. 
static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { int ref_index = 0, mask; AV1_COMMON *const cm = &pbi->common; @@ -388,7 +391,7 @@ static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { // error cm->cur_frame->buf.corrupted = 1; decrease_ref_count(cm->cur_frame, pool); - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; } else { pbi->output_frames[pbi->num_output_frames] = cm->cur_frame; pbi->num_output_frames++; @@ -427,8 +430,8 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, const uint8_t **psource) { AV1_COMMON *volatile const cm = &pbi->common; const uint8_t *source = *psource; - cm->error.error_code = AOM_CODEC_OK; - cm->error.has_detail = 0; + pbi->error.error_code = AOM_CODEC_OK; + pbi->error.has_detail = 0; if (size == 0) { // This is used to signal that we are missing frames. @@ -444,18 +447,18 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, } if (assign_cur_frame_new_fb(cm) == NULL) { - cm->error.error_code = AOM_CODEC_MEM_ERROR; + pbi->error.error_code = AOM_CODEC_MEM_ERROR; return 1; } // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. - if (setjmp(cm->error.jmp)) { + if (setjmp(pbi->error.jmp)) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int i; - cm->error.setjmp = 0; + pbi->error.setjmp = 0; // Synchronize all threads immediately as a subsequent decode call may // cause a resize invalidating some allocations. 
@@ -469,15 +472,15 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, return -1; } - cm->error.setjmp = 1; + pbi->error.setjmp = 1; int frame_decoded = aom_decode_frame_from_obus(pbi, source, source + size, psource); if (frame_decoded < 0) { - assert(cm->error.error_code != AOM_CODEC_OK); + assert(pbi->error.error_code != AOM_CODEC_OK); release_current_frame(pbi); - cm->error.setjmp = 0; + pbi->error.setjmp = 0; return 1; } @@ -498,8 +501,8 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, pbi->decoding_first_frame = 0; } - if (cm->error.error_code != AOM_CODEC_OK) { - cm->error.setjmp = 0; + if (pbi->error.error_code != AOM_CODEC_OK) { + pbi->error.setjmp = 0; return 1; } @@ -518,7 +521,7 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, } // Update progress in frame parallel decode. - cm->error.setjmp = 0; + pbi->error.setjmp = 0; return 0; } diff --git a/third_party/libaom/source/libaom/av1/decoder/decoder.h b/third_party/libaom/source/libaom/av1/decoder/decoder.h index b20e9c1dda..226b9dca85 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decoder.h +++ b/third_party/libaom/source/libaom/av1/decoder/decoder.h @@ -112,6 +112,8 @@ typedef struct ThreadData { // Motion compensation buffer used to get a prediction buffer with extended // borders. One buffer for each of the two possible references. uint8_t *mc_buf[2]; + // Mask for this block used for compound prediction. + uint8_t *seg_mask; // Allocated size of 'mc_buf'. int32_t mc_buf_size; // If true, the pointers in 'mc_buf' were converted from highbd pointers. @@ -227,6 +229,8 @@ typedef struct AV1Decoder { AV1LfSync lf_row_sync; AV1LrSync lr_row_sync; AV1LrStruct lr_ctxt; + AV1CdefSync cdef_sync; + AV1CdefWorkerData *cdef_worker; AVxWorker *tile_workers; int num_workers; DecWorkerData *thread_data; @@ -330,6 +334,32 @@ typedef struct AV1Decoder { int is_arf_frame_present; int num_tile_groups; aom_s_frame_info sframe_info; + + /*! 
+ * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader seq_params; + + /*! + * If true, buffer removal times are present. + */ + bool buffer_removal_time_present; + + /*! + * Code and details about current error status. + */ + struct aom_internal_error_info error; + + /*! + * Number of temporal layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_temporal_layers; + + /*! + * Number of spatial layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_spatial_layers; } AV1Decoder; // Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error diff --git a/third_party/libaom/source/libaom/av1/decoder/obu.c b/third_party/libaom/source/libaom/av1/decoder/obu.c index d3d1f0e8be..6c80148cc9 100644 --- a/third_party/libaom/source/libaom/av1/decoder/obu.c +++ b/third_party/libaom/source/libaom/av1/decoder/obu.c @@ -69,7 +69,7 @@ static int byte_alignment(AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { while (rb->bit_offset & 7) { if (aom_rb_read_bit(rb)) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } @@ -110,12 +110,12 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, // Use a local variable to store the information as we decode. At the end, // if no errors have occurred, cm->seq_params is updated. 
- SequenceHeader sh = cm->seq_params; + SequenceHeader sh = *cm->seq_params; SequenceHeader *const seq_params = &sh; seq_params->profile = av1_read_profile(rb); if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } @@ -124,7 +124,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); // Video must have reduced_still_picture_hdr = 0 if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } @@ -135,7 +135,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->operating_points_cnt_minus_1 = 0; seq_params->operating_point_idc[0] = 0; if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } seq_params->tier[0] = 0; @@ -144,7 +144,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, } else { seq_params->timing_info_present = aom_rb_read_bit(rb); if (seq_params->timing_info_present) { - av1_read_timing_info_header(&seq_params->timing_info, &cm->error, rb); + av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb); seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb); if (seq_params->decoder_model_info_present_flag) @@ -159,7 +159,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->operating_point_idc[i] = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } // This is the seq_level_idx[i] > 7 check in the spec. 
seq_level_idx 7 @@ -188,7 +188,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass // the check if (seq_params->op_params[i].bitrate == 0) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support this combination of " "profile, level, and tier."); // Buffer size in bits/s is bitrate in bits/s * 1 s @@ -212,7 +212,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, aom_rb_read_literal(rb, 4) + 1; if (seq_params->op_params[i].initial_display_delay > 10) aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support more than 10 decoded frames delay"); } else { seq_params->op_params[i].initial_display_delay = 10; @@ -232,19 +232,19 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, pbi->current_operating_point = seq_params->operating_point_idc[operating_point]; if (aom_get_num_layers_from_operating_point_idc( - pbi->current_operating_point, &cm->number_spatial_layers, - &cm->number_temporal_layers) != AOM_CODEC_OK) { - cm->error.error_code = AOM_CODEC_ERROR; + pbi->current_operating_point, &pbi->number_spatial_layers, + &pbi->number_temporal_layers) != AOM_CODEC_OK) { + pbi->error.error_code = AOM_CODEC_ERROR; return 0; } av1_read_sequence_header(cm, rb, seq_params); - av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error); + av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error); if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) && !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) && !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, " "%d %d 
subsampling is not supported.\n", seq_params->subsampling_x, seq_params->subsampling_y); @@ -253,18 +253,18 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->film_grain_params_present = aom_rb_read_bit(rb); if (av1_check_trailing_bits(pbi, rb) != 0) { - // cm->error.error_code is already set. + // pbi->error.error_code is already set. return 0; } // If a sequence header has been decoded before, we check if the new // one is consistent with the old one. if (pbi->sequence_header_ready) { - if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) + if (!are_seq_headers_consistent(cm->seq_params, seq_params)) pbi->sequence_header_changed = 1; } - cm->seq_params = *seq_params; + *cm->seq_params = *seq_params; pbi->sequence_header_ready = 1; return ((rb->bit_offset - saved_bit_offset + 7) >> 3); @@ -303,7 +303,7 @@ static int32_t read_tile_group_header(AV1Decoder *pbi, tile_start_and_end_present_flag = aom_rb_read_bit(rb); if (tile_start_implicit && tile_start_and_end_present_flag) { aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); return -1; } @@ -318,20 +318,20 @@ static int32_t read_tile_group_header(AV1Decoder *pbi, *end_tile = aom_rb_read_literal(rb, tile_bits); } if (*start_tile != pbi->next_start_tile) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_start (%d) must be equal to %d", *start_tile, pbi->next_start_tile); return -1; } if (*start_tile > *end_tile) { aom_internal_error( - &cm->error, AOM_CODEC_CORRUPT_FRAME, + &pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile, *start_tile); return -1; } if (*end_tile >= num_tiles) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_end (%d) must be less than NumTiles 
(%d)", *end_tile, num_tiles); return -1; @@ -388,15 +388,16 @@ static void alloc_tile_list_buffer(AV1Decoder *pbi) { (pbi->output_frame_height_in_tiles_minus_1 + 1)); // Allocate the tile list output buffer. - // Note: if cm->seq_params.use_highbitdepth is 1 and cm->seq_params.bit_depth - // is 8, we could allocate less memory, namely, 8 bits/pixel. + // Note: if cm->seq_params->use_highbitdepth is 1 and + // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8 + // bits/pixel. if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width, - output_frame_height, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, - (cm->seq_params.use_highbitdepth && - (cm->seq_params.bit_depth > AOM_BITS_8)), + output_frame_height, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, + (cm->seq_params->use_highbitdepth && + (cm->seq_params->bit_depth > AOM_BITS_8)), 0, cm->features.byte_alignment)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate the tile list output buffer"); } @@ -430,8 +431,8 @@ static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, av1_get_uniform_tile_size(cm, &tile_width, &tile_height); const int tile_width_in_pixels = tile_width * MI_SIZE; const int tile_height_in_pixels = tile_height * MI_SIZE; - const int ssy = cm->seq_params.subsampling_y; - const int ssx = cm->seq_params.subsampling_x; + const int ssy = cm->seq_params->subsampling_y; + const int ssx = cm->seq_params->subsampling_x; const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf; @@ -455,8 +456,8 @@ static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, int vstart2 = tr * h; int hstart2 = tc * w; - if (cm->seq_params.use_highbitdepth && - cm->seq_params.bit_depth == AOM_BITS_8) { + if (cm->seq_params->use_highbitdepth && + cm->seq_params->bit_depth == AOM_BITS_8) { yv12_tile_copy(cur_frame, hstart1, hend1, 
vstart1, vend1, &pbi->tile_list_outbuf, hstart2, vstart2, plane); } else { @@ -501,7 +502,7 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); if (pbi->tile_count_minus_1 > MAX_TILES - 1) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } @@ -524,7 +525,7 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, // Set reference for each tile. int ref_idx = aom_rb_read_literal(rb, 8); if (ref_idx >= MAX_EXTERNAL_REFERENCES) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1, @@ -535,14 +536,14 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 || pbi->dec_tile_row >= cm->tiles.rows || pbi->dec_tile_col >= cm->tiles.cols) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1; data += tile_info_bytes; if ((size_t)(data_end - data) < pbi->coded_tile_data_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } @@ -581,18 +582,17 @@ static void alloc_read_metadata(AV1Decoder *const pbi, OBU_METADATA_TYPE metadata_type, const uint8_t *data, size_t sz, aom_metadata_insert_flags_t insert_flag) { - AV1_COMMON *const cm = &pbi->common; if (!pbi->metadata) { pbi->metadata = aom_img_metadata_array_alloc(0); if (!pbi->metadata) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate metadata array"); } } aom_metadata_t *metadata = aom_img_metadata_alloc(metadata_type, data, sz, insert_flag); if (!metadata) { - 
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Error allocating metadata"); } aom_metadata_t **metadata_array = @@ -600,7 +600,7 @@ static void alloc_read_metadata(AV1Decoder *const pbi, (pbi->metadata->sz + 1) * sizeof(metadata)); if (!metadata_array) { aom_img_metadata_free(metadata); - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Error growing metadata array"); } pbi->metadata->metadata_array = metadata_array; @@ -611,22 +611,21 @@ static void alloc_read_metadata(AV1Decoder *const pbi, // On failure, calls aom_internal_error() and does not return. static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { - AV1_COMMON *const cm = &pbi->common; if (sz == 0) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "itu_t_t35_country_code is missing"); } int country_code_size = 1; if (*data == 0xFF) { if (sz == 1) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "itu_t_t35_country_code_extension_byte is missing"); } ++country_code_size; } int end_index = get_last_nonzero_byte_index(data, sz); if (end_index < country_code_size) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "No trailing bits found in ITU-T T.35 metadata OBU"); } // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says: @@ -634,7 +633,7 @@ static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, // specified in Recommendation ITU-T T.35. // Therefore the first trailing byte should be 0x80. 
if (data[end_index] != 0x80) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "The last nonzero byte of the ITU-T T.35 metadata OBU " "is 0x%02x, should be 0x80.", data[end_index]); @@ -648,9 +647,8 @@ static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { const size_t kHdrCllPayloadSize = 4; - AV1_COMMON *const cm = &pbi->common; if (sz < kHdrCllPayloadSize) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Incorrect HDR CLL metadata payload size"); } alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize, @@ -663,9 +661,8 @@ static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { const size_t kMdcvPayloadSize = 24; - AV1_COMMON *const cm = &pbi->common; if (sz < kMdcvPayloadSize) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Incorrect HDR MDCV metadata payload size"); } alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize, @@ -770,11 +767,10 @@ static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) { // pbi->common.error.error_code and returns 0, or calls aom_internal_error() // and does not return. 
static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { - AV1_COMMON *const cm = &pbi->common; size_t type_length; uint64_t type_value; if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value; @@ -782,7 +778,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { // If metadata_type is reserved for future use or a user private value, // ignore the entire OBU and just check trailing bits. if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; @@ -796,7 +792,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { type_length + read_metadata_hdr_cll(pbi, data + type_length, sz - type_length); if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; @@ -805,7 +801,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { type_length + read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length); if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; @@ -820,7 +816,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { read_metadata_timecode(&rb); } if (av1_check_trailing_bits(pbi, &rb) != 0) { - // cm->error.error_code is already set. + // pbi->error.error_code is already set. 
return 0; } assert((rb.bit_offset & 7) == 0); @@ -838,7 +834,7 @@ static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, // trailing byte should be 0x80. See https://crbug.com/aomedia/2393. const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz); if (last_nonzero_byte != 0x80) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } } @@ -846,7 +842,7 @@ static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, } // On success, returns a boolean that indicates whether the decoding of the -// current frame is finished. On failure, sets cm->error.error_code and +// current frame is finished. On failure, sets pbi->error.error_code and // returns -1. int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, @@ -872,7 +868,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, pbi->num_tile_groups = 0; if (data_end < data) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -880,7 +876,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0; // decode frame as a series of OBUs - while (!frame_decoding_finished && cm->error.error_code == AOM_CODEC_OK) { + while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) { struct aom_read_bit_buffer rb; size_t payload_size = 0; size_t decoded_payload_size = 0; @@ -890,7 +886,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (bytes_available == 0 && !pbi->seen_frame_header) { *p_data_end = data; - cm->error.error_code = AOM_CODEC_OK; + pbi->error.error_code = AOM_CODEC_OK; break; } @@ -899,7 +895,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) { - cm->error.error_code 
= status; + pbi->error.error_code = status; return -1; } @@ -912,7 +908,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, data += bytes_read; if ((size_t)(data_end - data) < payload_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -936,16 +932,16 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (pbi->seen_frame_header) { // A new temporal unit has started, but the frame in the previous // temporal unit is incomplete. - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } break; case OBU_SEQUENCE_HEADER: decoded_payload_size = read_sequence_header_obu(pbi, &rb); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; // The sequence header should not change in the middle of a frame. if (pbi->sequence_header_changed && pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } break; @@ -954,13 +950,13 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, case OBU_FRAME: if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) { if (!pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } else { // OBU_FRAME_HEADER or OBU_FRAME. if (pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } @@ -978,7 +974,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, // frame_header_obu. 
if (frame_header_size > payload_size || memcmp(data, frame_header, frame_header_size) != 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } assert(rb.bit_offset == 0); @@ -990,7 +986,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (cm->show_existing_frame) { if (obu_header.type == OBU_FRAME) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return -1; } frame_decoding_finished = 1; @@ -1012,23 +1008,23 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (obu_header.type != OBU_FRAME) break; obu_payload_offset = frame_header_size; // Byte align the reader before reading the tile group. - // byte_alignment() has set cm->error.error_code if it returns -1. + // byte_alignment() has set pbi->error.error_code if it returns -1. if (byte_alignment(cm, &rb)) return -1; AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. 
case OBU_TILE_GROUP: if (!pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } if (obu_payload_offset > payload_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } decoded_payload_size += read_one_tile_group_obu( pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset, data + payload_size, p_data_end, &frame_decoding_finished, obu_header.type == OBU_FRAME); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; is_first_tg_obu_received = 0; if (frame_decoding_finished) { pbi->seen_frame_header = 0; @@ -1038,18 +1034,18 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, break; case OBU_METADATA: decoded_payload_size = read_metadata(pbi, data, payload_size); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; case OBU_TILE_LIST: if (CONFIG_NORMAL_TILE_MODE) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return -1; } // This OBU type is purely for the large scale tile coding mode. // The common camera frame header has to be already decoded. 
if (!pbi->camera_frame_header_ready) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -1058,17 +1054,17 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, decoded_payload_size = read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size, p_data_end, &frame_decoding_finished); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; case OBU_PADDING: decoded_payload_size = read_padding(cm, data, payload_size); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; default: // Skip unrecognized OBUs if (payload_size > 0 && get_last_nonzero_byte(data, payload_size) == 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } decoded_payload_size = payload_size; @@ -1077,7 +1073,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, // Check that the signalled OBU size matches the actual amount of data read if (decoded_payload_size > payload_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -1085,7 +1081,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, while (decoded_payload_size < payload_size) { uint8_t padding_byte = data[decoded_payload_size++]; if (padding_byte != 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } @@ -1093,6 +1089,6 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, data += payload_size; } - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; return frame_decoding_finished; } diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c 
b/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c index 3ea5f63020..278e1ca92f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c +++ b/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c @@ -81,7 +81,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { if (is_frame_aq_enabled(cpi)) { int segment; const int aq_strength = - get_aq_c_strength(base_qindex, cm->seq_params.bit_depth); + get_aq_c_strength(base_qindex, cm->seq_params->bit_depth); // Clear down the segment map. memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG, @@ -108,7 +108,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { qindex_delta = av1_compute_qdelta_by_rate( &cpi->rc, cm->current_frame.frame_type, base_qindex, aq_c_q_adj_factor[aq_strength][segment], cpi->is_screen_content_type, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); // For AQ complexity mode, we dont allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment @@ -150,17 +150,17 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, // It is converted to bits << AV1_PROB_COST_SHIFT units. const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) << AV1_PROB_COST_SHIFT; - const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size; + const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size; const int target_rate = (int)(num / denom); double logvar; double low_var_thresh; const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); aom_clear_system_state(); low_var_thresh = (is_stat_consumption_stage_twopass(cpi)) - ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) + ? 
AOMMAX(exp(cpi->ppi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) : DEFAULT_LV_THRESH; av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs); diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c index c7abe43c87..40b8c254d4 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c @@ -12,6 +12,7 @@ #include <limits.h> #include <math.h> +#include "av1/common/pred_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/ratectrl.h" @@ -82,7 +83,7 @@ static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { const RATE_CONTROL *const rc = &cpi->rc; int deltaq = av1_compute_qdelta_by_rate( rc, cpi->common.current_frame.frame_type, q, rate_factor, - cpi->is_screen_content_type, cpi->common.seq_params.bit_depth); + cpi->is_screen_content_type, cpi->common.seq_params->bit_depth); if ((-deltaq) > cr->max_qdelta_perc * q / 100) { deltaq = -cr->max_qdelta_perc * q / 100; } @@ -94,7 +95,7 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, const AV1_COMMON *const cm = &cpi->common; const FRAME_TYPE frame_type = cm->current_frame.frame_type; const int base_qindex = cm->quant_params.base_qindex; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int mbs = cm->mi_params.MBs; const int num4x4bl = mbs << 4; @@ -138,15 +139,51 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, bits_per_mb = (int)((1.0 - weight_segment) * av1_rc_bits_per_mb(cm->current_frame.frame_type, i, - correction_factor, cm->seq_params.bit_depth, + correction_factor, cm->seq_params->bit_depth, cpi->is_screen_content_type) + weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type, i + deltaq, 
correction_factor, - cm->seq_params.bit_depth, + cm->seq_params->bit_depth, cpi->is_screen_content_type)); return bits_per_mb; } +void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + int cdf_num; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int prev_segment_id = mbmi->segment_id; + mbmi->segment_id = av1_get_spatial_seg_pred(cm, xd, &cdf_num); + if (prev_segment_id != mbmi->segment_id) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_params.mi_cols + mi_col; + for (int mi_y = 0; mi_y < ymis; mi_y++) { + for (int mi_x = 0; mi_x < xmis; mi_x++) { + const int map_offset = + block_index + mi_y * cm->mi_params.mi_cols + mi_x; + cr->map[map_offset] = 0; + cpi->enc_seg.map[map_offset] = mbmi->segment_id; + cm->cur_frame->seg_map[map_offset] = mbmi->segment_id; + } + } + if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks -= xmis * ymis; + else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks -= xmis * ymis; + if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks += xmis * ymis; + else if (cyclic_refresh_segment_id(mbmi->segment_id) == + CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks += xmis * ymis; + } +} + void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, int64_t rate, int64_t dist, int skip, @@ -191,22 +228,21 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x, // Update entries in the cyclic refresh map with new_map_value, 
and // copy mbmi->segment_id into global segmentation map. - // 8x8 is smallest coding block size for non-key frames. - const int sh = bw << 1; - for (int mi_y = 0; mi_y < ymis; mi_y += 2) { - for (int mi_x = 0; mi_x < xmis; mi_x += 2) { - int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x; + for (int mi_y = 0; mi_y < ymis; mi_y++) { + for (int mi_x = 0; mi_x < xmis; mi_x++) { + const int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x; cr->map[map_offset] = new_map_value; cpi->enc_seg.map[map_offset] = mbmi->segment_id; + cm->cur_frame->seg_map[map_offset] = mbmi->segment_id; } - // Accumulate cyclic refresh update counters. - if (!dry_run && !frame_is_intra_only(cm)) { - if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1) - x->actual_num_seg1_blocks += sh; - else if (cyclic_refresh_segment_id(mbmi->segment_id) == - CR_SEGMENT_ID_BOOST2) - x->actual_num_seg2_blocks += sh; - } + } + // Accumulate cyclic refresh update counters. + if (!dry_run) { + if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks += xmis * ymis; + else if (cyclic_refresh_segment_id(mbmi->segment_id) == + CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks += xmis * ymis; } } @@ -234,15 +270,15 @@ void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) { const int avg_cnt_zeromv = 100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols); - if (!cpi->use_svc || - (cpi->use_svc && + if (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4; // For SVC: set avg_frame_low_motion (only computed on top spatial layer) // to all lower spatial layers. 
- if (cpi->use_svc && + if (cpi->ppi->use_svc && svc->spatial_layer_id == svc->number_spatial_layers - 1) { for (int i = 0; i < svc->number_spatial_layers - 1; ++i) { const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, @@ -257,15 +293,16 @@ void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) { void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; // Set minimum gf_interval for GF update to a multiple of the refresh period, // with some max limit. Depending on past encoding stats, GF flag may be // reset and update may not occur until next baseline_gf_interval. if (cr->percent_refresh > 0) - rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40); + p_rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40); else - rc->baseline_gf_interval = 20; - if (rc->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8; + p_rc->baseline_gf_interval = 20; + if (rc->avg_frame_low_motion < 40) p_rc->baseline_gf_interval = 8; } // Update the segmentation map, and related quantities: cyclic refresh map, @@ -282,10 +319,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols); - sb_cols = (mi_params->mi_cols + cm->seq_params.mib_size - 1) / - cm->seq_params.mib_size; - sb_rows = (mi_params->mi_rows + cm->seq_params.mib_size - 1) / - cm->seq_params.mib_size; + sb_cols = (mi_params->mi_cols + cm->seq_params->mib_size - 1) / + cm->seq_params->mib_size; + sb_rows = (mi_params->mi_rows + cm->seq_params->mib_size - 1) / + cm->seq_params->mib_size; sbs_in_frame = sb_cols * sb_rows; // Number of target blocks to get the q delta (segment 1). 
block_count = @@ -302,8 +339,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { // Get the mi_row/mi_col corresponding to superblock index i. int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; - int mi_row = sb_row_index * cm->seq_params.mib_size; - int mi_col = sb_col_index * cm->seq_params.mib_size; + int mi_row = sb_row_index * cm->seq_params->mib_size; + int mi_col = sb_col_index * cm->seq_params->mib_size; // TODO(any): Ensure the population of // cpi->common.features.allow_screen_content_tools and use the same instead // of cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN @@ -315,8 +352,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { assert(mi_col >= 0 && mi_col < mi_params->mi_cols); bl_index = mi_row * mi_params->mi_cols + mi_col; // Loop through all MI blocks in superblock and update map. - xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size); - ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size); + xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params->mib_size); + ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params->mib_size); // cr_map only needed at 8x8 blocks. for (y = 0; y < ymis; y += 2) { for (x = 0; x < xmis; x += 2) { @@ -361,11 +398,20 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { int qp_thresh = AOMMIN(20, rc->best_quality << 1); int qp_max_thresh = 118 * MAXQ >> 7; cr->apply_cyclic_refresh = 1; + int avg_frame_qindex_inter_frame; +#if CONFIG_FRAME_PARALLEL_ENCODE + avg_frame_qindex_inter_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? 
cpi->ppi->temp_avg_frame_qindex[INTER_FRAME] + : rc->avg_frame_qindex[INTER_FRAME]; +#else + avg_frame_qindex_inter_frame = rc->avg_frame_qindex[INTER_FRAME]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) || cpi->svc.temporal_layer_id > 0 || - rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + avg_frame_qindex_inter_frame < qp_thresh || (rc->frames_since_key > 20 && - rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) || + avg_frame_qindex_inter_frame > qp_max_thresh) || (rc->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) { cr->apply_cyclic_refresh = 0; return; @@ -446,7 +492,7 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { return; } else { const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); aom_clear_system_state(); // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h index 97bd6f26b1..1c0d5cb4d7 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h +++ b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h @@ -161,6 +161,30 @@ int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi, int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i, double correction_factor); +/*!\brief Update segment_id for blocks are skipped. + * + * After encoding a given prediction block, of size bsize at (mi_row, mi_col), + * check if we should reset the segment_id based on skip_txfm, + * and update the cyclic_refresh map and segmentation counters. 
+ * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] x Pointer to MACROBLOCK structure + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE + * \param[in] bsize Block size + * + * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and + * the \c cm->cpi->enc_seg.map. + */ + +void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize); + /*!\brief Update segment_id for block based on mode selected. * * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_variance.c b/third_party/libaom/source/libaom/av1/encoder/aq_variance.c index 92d7ad172d..79bf9f8419 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_variance.c +++ b/third_party/libaom/source/libaom/av1/encoder/aq_variance.c @@ -52,7 +52,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height); - int avg_energy = (int)(cpi->twopass.mb_av_energy - 2); + int avg_energy = (int)(cpi->ppi->twopass.mb_av_energy - 2); double avg_ratio; if (avg_energy > 7) avg_energy = 7; if (avg_energy < 0) avg_energy = 0; @@ -81,7 +81,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { int qindex_delta = av1_compute_qdelta_by_rate( &cpi->rc, cm->current_frame.frame_type, base_qindex, rate_ratio[i] / avg_ratio, cpi->is_screen_content_type, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); // We don't allow qindex 0 in a segment if the base value is not 0. 
// Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment @@ -126,14 +126,14 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { for (j = 0; j < bw; j += 4) { if (is_cur_buf_hbd(xd)) { var += - log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf( x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) / 16); } else { var += - log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf( x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, av1_all_zeros, 0, &sse) / 16); @@ -154,15 +154,12 @@ static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { MACROBLOCKD *xd = &x->e_mbd; int stride = x->plane[0].src.stride; uint8_t *buf = x->plane[0].src.buf; - const int bw = MI_SIZE * mi_size_wide[bs]; - const int bh = MI_SIZE * mi_size_high[bs]; + const int num_8x8_cols = block_size_wide[bs] / 8; + const int num_8x8_rows = block_size_high[bs] / 8; const int hbd = is_cur_buf_hbd(xd); - int var = 0; - for (int r = 0; r < bh; r += 8) - for (int c = 0; c < bw; c += 8) { - var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd); - } + int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows, + num_8x8_cols); return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; } @@ -178,7 +175,7 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, double energy, energy_midpoint; aom_clear_system_state(); energy_midpoint = (is_stat_consumption_stage_twopass(cpi)) - ? cpi->twopass.frame_avg_haar_energy + ? 
cpi->ppi->twopass.frame_avg_haar_energy : DEFAULT_E_MIDPOINT; energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint; return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); @@ -199,7 +196,7 @@ int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, int qindex_delta = av1_compute_qdelta_by_rate( &cpi->rc, cm->current_frame.frame_type, base_qindex, deltaq_rate_ratio[rate_level], cpi->is_screen_content_type, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { qindex_delta = -base_qindex + 1; diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c b/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c index dbc86c5034..8b2fc38923 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c +++ b/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c @@ -27,8 +27,8 @@ #if CONFIG_AV1_TEMPORAL_DENOISING // For SVC: only do noise estimation on top spatial layer. static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) { - return (!cpi->use_svc || - (cpi->use_svc && + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); } #endif @@ -61,7 +61,7 @@ static int enable_noise_estimation(AV1_COMP *const cpi) { cpi->common.height != resize_pending_params->height)); #if CONFIG_AV1_HIGHBITDEPTH - if (cpi->common.seq_params.use_highbitdepth) return 0; + if (cpi->common.seq_params->use_highbitdepth) return 0; #endif // Enable noise estimation if denoising is on. #if CONFIG_AV1_TEMPORAL_DENOISING @@ -75,7 +75,7 @@ static int enable_noise_estimation(AV1_COMP *const cpi) { // Not enabled for low resolutions. 
if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && - resize_pending == 0 && !cpi->use_svc && + resize_pending == 0 && !cpi->ppi->use_svc && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && cpi->common.width * cpi->common.height >= 640 * 360) return 1; @@ -227,7 +227,7 @@ void av1_update_noise_estimate(AV1_COMP *const cpi) { unsigned int sse; // Compute variance between co-located blocks from current and // last input frames. - unsigned int variance = cpi->fn_ptr[bsize].vf( + unsigned int variance = cpi->ppi->fn_ptr[bsize].vf( src_y, src_ystride, last_src_y, last_src_ystride, &sse); unsigned int hist_index = variance / bin_size; if (hist_index < MAX_VAR_HIST_BINS) diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c index 9d38e2d77d..2b07e4c71b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c +++ b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c @@ -33,6 +33,40 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, *eob_ptr = 0; } +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr)); + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + int eob = 0; + for (int i = 0; i < coeff_count; i++) { + const int rc = scan[i]; + const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if ((abs_coeff << (1 + log_scale)) 
>= thresh) { + abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = + (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + } + if (tmp32) eob = i + 1; + } + return eob; +} + static void quantize_fp_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -53,26 +87,9 @@ static void quantize_fp_helper_c( memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (qm_ptr == NULL && iqm_ptr == NULL) { - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); - const int coeff = coeff_ptr[rc]; - const int coeff_sign = AOMSIGN(coeff); - int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp32 = 0; - if ((abs_coeff << (1 + log_scale)) >= thresh) { - abs_coeff = - clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); - tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); - if (tmp32) { - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - const tran_low_t abs_dqcoeff = - (tmp32 * dequant_ptr[rc != 0]) >> log_scale; - dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; - } - } - if (tmp32) eob = i; - } + *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr, + log_scale, scan, (int)n_coeffs, + coeff_ptr, qcoeff_ptr, dqcoeff_ptr); } else { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
@@ -100,8 +117,8 @@ static void quantize_fp_helper_c( if (tmp32) eob = i; } + *eob_ptr = eob + 1; } - *eob_ptr = eob + 1; } #if CONFIG_AV1_HIGHBITDEPTH @@ -767,7 +784,7 @@ void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel, aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q, min_qmlevel, max_qmlevel); - if (!cm->seq_params.separate_uv_delta_q) + if (!cm->seq_params->separate_uv_delta_q) quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; else quant_params->qmatrix_level_v = diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h index ad9619747a..215feb0603 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h +++ b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h @@ -118,6 +118,32 @@ int av1_qindex_to_quantizer(int qindex); void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); +/*!\brief Quantize transform coefficients without using qmatrix + * + * quant_ptr, dequant_ptr and round_ptr are size 2 arrays, + * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs. + * + * \param[in] quant_ptr 16-bit fixed point representation of inverse + * quantize step size, i.e. 
2^16/dequant + * \param[in] dequant_ptr quantize step size + * \param[in] round_ptr rounding + * \param[in] log_scale the relative log scale of the transform + * coefficients + * \param[in] scan scan[i] indicates the position of ith to-be-coded + * coefficient + * \param[in] coeff_count number of coefficients + * \param[out] qcoeff_ptr quantized coefficients + * \param[out] dqcoeff_ptr dequantized coefficients + * + * \return The last non-zero coefficient's scan index plus 1 + */ +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr); + void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c b/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c index 6c5bb930e1..96f3d7dcfe 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c +++ b/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c @@ -349,7 +349,7 @@ void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers, cpi->source->y_width, cpi->svc.ref_idx[0], cpi->svc.ref_idx[3], - cpi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref); + cpi->ppi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref); if (decision == FILTER_BLOCK) { decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start, @@ -415,7 +415,7 @@ void av1_denoiser_update_frame_info( return; } - if (svc->external_ref_frame_config) { + if (svc->set_ref_frame_config) { int i; for (i = 0; i < REF_FRAMES; i++) { if 
(svc->refresh[svc->spatial_layer_id] & (1 << i)) @@ -485,8 +485,8 @@ static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm, if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { fail = aom_alloc_frame_buffer( &denoiser->running_avg_y[fb_idx], cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment); if (fail) { av1_denoiser_free(denoiser); @@ -501,7 +501,7 @@ int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, int refresh_alt, int refresh_gld, int refresh_lst, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) { int fail = 0; - if (svc->external_ref_frame_config) { + if (svc->set_ref_frame_config) { int i; for (i = 0; i < REF_FRAMES; i++) { if (cm->current_frame.frame_type == KEY_FRAME || @@ -724,7 +724,7 @@ void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) { (cpi->common.width != cpi->resize_pending_params.width || cpi->common.height != cpi->resize_pending_params.height)); - if (cpi->use_svc) { + if (cpi->ppi->use_svc) { // TODO(kyslov) Enable when SVC temporal denosing is implemented #if 0 const int svc_buf_shift = @@ -746,7 +746,7 @@ void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) { cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, cpi->lst_fb_idx)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to re-allocate denoiser for SVC"); #endif } diff --git a/third_party/libaom/source/libaom/av1/encoder/bitstream.c b/third_party/libaom/source/libaom/av1/encoder/bitstream.c index 2b583790ff..85c0183b17 100644 --- a/third_party/libaom/source/libaom/av1/encoder/bitstream.c +++ b/third_party/libaom/source/libaom/av1/encoder/bitstream.c @@ -41,6 +41,7 @@ #include "av1/encoder/cost.h" #include 
"av1/encoder/encodemv.h" #include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/palette.h" #include "av1/encoder/segmentation.h" @@ -185,12 +186,13 @@ static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd, } assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + const int offsetr = blk_row + row; for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - int offsetr = blk_row + row; - int offsetc = blk_col + col; + const int offsetc = blk_col + col; write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); } + } } } @@ -313,14 +315,16 @@ static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd, static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, int lf_id, - int delta_lflevel, aom_writer *w) { + int delta_lflevel, + int delta_lf_multi, aom_writer *w) { int sign = delta_lflevel < 0; int abs = sign ? -delta_lflevel : delta_lflevel; int rem_bits, thr; int smallval = abs < DELTA_LF_SMALL ? 1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; - if (cm->delta_q_info.delta_lf_multi) { + if (delta_lf_multi) { assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2)); aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), @@ -380,7 +384,6 @@ static AOM_INLINE void pack_txb_tokens( #if CONFIG_RD_DEBUG TOKEN_STATS tmp_token_stats; init_token_stats(&tmp_token_stats); - token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; token_stats->cost += tmp_token_stats.cost; #endif } else { @@ -388,14 +391,17 @@ static AOM_INLINE void pack_txb_tokens( const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { - for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) { - const int offsetr = blk_row + r; + for (int r = 0; r < row_end; r += bsh) { + const int offsetr = blk_row + r; + for (int c = 0; c < col_end; c += bsw) { const int offsetc = blk_col + c; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, offsetc, sub_txs, token_stats); @@ -445,7 +451,7 @@ int av1_neg_interleave(int x, int ref, int max) { } } -static AOM_INLINE void write_segment_id(AV1_COMP *cpi, +static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, aom_writer *w, const struct segmentation *seg, @@ -454,7 +460,6 @@ static AOM_INLINE void write_segment_id(AV1_COMP *cpi, if (!seg->enabled || !seg->update_map) return; AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int cdf_num; const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num); const int mi_row = xd->mi_row; @@ -613,8 +618,8 @@ static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta, } static AOM_INLINE void 
write_mb_interp_filter(AV1_COMMON *const cm, - const MACROBLOCKD *xd, - aom_writer *w) { + ThreadData *td, aom_writer *w) { + const MACROBLOCKD *xd = &td->mb.e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -633,8 +638,8 @@ static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, av1_extract_interp_filter(mbmi->interp_filters, dir); aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); - ++cm->cur_frame->interp_filter_selected[filter]; - if (cm->seq_params.enable_dual_filter == 0) return; + ++td->interp_filter_selected[filter]; + if (cm->seq_params->enable_dual_filter == 0) return; } } } @@ -777,7 +782,7 @@ static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES); - write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w); + write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w); } } @@ -792,7 +797,7 @@ static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES); - write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w); + write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w); } } } @@ -874,7 +879,7 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, // At the start of a superblock, mark that we haven't yet written CDEF // strengths for any of the CDEF units contained in this superblock. 
- const int sb_mask = (cm->seq_params.mib_size - 1); + const int sb_mask = (cm->seq_params->mib_size - 1); const int mi_row_in_sb = (xd->mi_row & sb_mask); const int mi_col_in_sb = (xd->mi_col & sb_mask); if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { @@ -889,7 +894,7 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, const int index_mask = cdef_size; const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); - const int index = (cm->seq_params.sb_size == BLOCK_128X128) + const int index = (cm->seq_params->sb_size == BLOCK_128X128) ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb : 0; @@ -909,9 +914,9 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, } static AOM_INLINE void write_inter_segment_id( - AV1_COMP *cpi, aom_writer *w, const struct segmentation *const seg, - struct segmentation_probs *const segp, int skip, int preskip) { - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w, + const struct segmentation *const seg, struct segmentation_probs *const segp, + int skip, int preskip) { MB_MODE_INFO *const mbmi = xd->mi[0]; AV1_COMMON *const cm = &cpi->common; const int mi_row = xd->mi_row; @@ -923,7 +928,7 @@ static AOM_INLINE void write_inter_segment_id( } else { if (seg->segid_preskip) return; if (skip) { - write_segment_id(cpi, mbmi, w, seg, segp, 1); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 1); if (seg->temporal_update) mbmi->seg_id_predicted = 0; return; } @@ -933,35 +938,33 @@ static AOM_INLINE void write_inter_segment_id( aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); aom_write_symbol(w, pred_flag, pred_cdf, 2); if (!pred_flag) { - write_segment_id(cpi, mbmi, w, seg, segp, 0); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); } if (pred_flag) { set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, mi_row, mi_col, mbmi->segment_id); } } else { - 
write_segment_id(cpi, mbmi, w, seg, segp, 0); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); } } } // If delta q is present, writes delta_q index. // Also writes delta_q loop filter levels, if present. -static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, +static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm, + MACROBLOCKD *const xd, int skip, aom_writer *w) { - AV1_COMMON *const cm = &cpi->common; const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) { - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; const int super_block_upper_left = - ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); - if ((bsize != cm->seq_params.sb_size || skip == 0) && + if ((bsize != cm->seq_params->sb_size || skip == 0) && super_block_upper_left) { assert(mbmi->current_qindex > 0); const int reduced_delta_qindex = @@ -977,14 +980,14 @@ static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, int reduced_delta_lflevel = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / delta_q_info->delta_lf_res; - write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); + write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w); xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { int reduced_delta_lflevel = (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / delta_q_info->delta_lf_res; - write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); + write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w); xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } @@ -992,12 +995,10 @@ static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, } } -static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi, 
+static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm, + MACROBLOCKD *const xd, int is_keyframe, aom_writer *w) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const MB_MODE_INFO *const mbmi = xd->mi[0]; const PREDICTION_MODE mode = mbmi->mode; @@ -1020,7 +1021,7 @@ static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi, } // UV mode and UV angle delta. - if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); if (uv_mode == UV_CFL_PRED) @@ -1082,9 +1083,10 @@ static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { x->mbmi_ext_frame); } -static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { +static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; @@ -1099,7 +1101,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { const int is_compound = has_second_ref(mbmi); int ref; - write_inter_segment_id(cpi, w, seg, segp, 0, 1); + write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1); write_skip_mode(cm, xd, segment_id, mbmi, w); @@ -1107,18 +1109,18 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { const int skip = mbmi->skip_mode ? 
1 : write_skip(cm, xd, segment_id, mbmi, w); - write_inter_segment_id(cpi, w, seg, segp, skip, 0); + write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0); write_cdef(cm, xd, w, skip); - write_delta_q_params(cpi, skip, w); + write_delta_q_params(cm, xd, skip, w); if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); if (mbmi->skip_mode) return; if (!is_inter) { - write_intra_prediction_modes(cpi, 0, w); + write_intra_prediction_modes(cm, xd, 0, w); } else { int16_t mode_ctx; @@ -1146,21 +1148,23 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { for (ref = 0; ref < 1 + is_compound; ++ref) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, ref); - av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, + av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, 1); - av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, 0); - av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); } if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE && - cpi->common.seq_params.enable_interintra_compound && + cpi->common.seq_params->enable_interintra_compound && is_interintra_allowed(mbmi)) { const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; const int bsize_group = size_group_lookup[bsize]; @@ -1187,7 +1191,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { // Group B (1): interintra, compound_diffwtd, wedge if (has_second_ref(mbmi)) { const int masked_compound_used = 
is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int ctx_comp_group_idx = get_comp_group_idx_context(xd); @@ -1201,7 +1205,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { if (mbmi->compound_idx) assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); - if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); aom_write_symbol(w, mbmi->compound_idx, ec_ctx->compound_index_cdf[comp_index_ctx], 2); @@ -1234,7 +1238,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { } } } - write_mb_interp_filter(cm, xd, w); + write_mb_interp_filter(cm, td, w); } } @@ -1264,23 +1268,23 @@ static AOM_INLINE void write_mb_modes_kf( const MB_MODE_INFO *const mbmi = xd->mi[0]; if (seg->segid_preskip && seg->update_map) - write_segment_id(cpi, mbmi, w, seg, segp, 0); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); if (!seg->segid_preskip && seg->update_map) - write_segment_id(cpi, mbmi, w, seg, segp, skip); + write_segment_id(cpi, xd, mbmi, w, seg, segp, skip); write_cdef(cm, xd, w, skip); - write_delta_q_params(cpi, skip, w); + write_delta_q_params(cm, xd, skip, w); if (av1_allow_intrabc(cm)) { write_intrabc_info(xd, mbmi_ext_frame, w); if (is_intrabc_block(mbmi)) return; } - write_intra_prediction_modes(cpi, 1, w); + write_intra_prediction_modes(cm, xd, 1, w); } #if CONFIG_RD_DEBUG @@ -1295,24 +1299,8 @@ static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) { static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, int plane) { if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { - int r, c; printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", plane, rd_stats->txb_coeff_cost[plane], 
token_stats->cost); - printf("rd txb_coeff_cost_map\n"); - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { - printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]); - } - printf("\n"); - } - - printf("pack txb_coeff_cost_map\n"); - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { - printf("%d ", token_stats->txb_coeff_cost_map[r][c]); - } - printf("\n"); - } return 1; } return 0; @@ -1376,13 +1364,14 @@ static AOM_INLINE void enc_dump_logs( } #endif // ENC_MISMATCH_DEBUG -static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) { +static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *const xd = &td->mb.e_mbd; MB_MODE_INFO *m = xd->mi[0]; if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext_frame, w); + write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w); } else { // has_subpel_mv_component needs the ref frame buffers set up to look // up if they are scaled. 
has_subpel_mv_component is in turn needed by @@ -1393,7 +1382,7 @@ static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) { enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col); #endif // ENC_MISMATCH_DEBUG - pack_inter_mode_mvs(cpi, w); + pack_inter_mode_mvs(cpi, td, w); } } @@ -1426,18 +1415,17 @@ static AOM_INLINE void write_inter_txb_coeff( for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) { for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) { pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, - cm->seq_params.bit_depth, *block, blk_row, blk_col, + cm->seq_params->bit_depth, *block, blk_row, blk_col, max_tx_size, token_stats); *block += step; } } } -static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w, - const TokenExtra **tok, +static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, + aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; @@ -1487,17 +1475,18 @@ static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w, } } -static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, const TokenExtra **tok, +static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td, + const TileInfo *const tile, aom_writer *w, + const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; - MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *xd = &td->mb.e_mbd; FRAME_CONTEXT *tile_ctx = xd->tile_ctx; const int grid_idx = mi_row * mi_params->mi_stride + mi_col; xd->mi = mi_params->mi_grid_base + grid_idx; - cpi->td.mb.mbmi_ext_frame = + td->mb.mbmi_ext_frame = cpi->mbmi_ext_info.frame_base 
+ get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, cpi->mbmi_ext_info.stride); @@ -1506,7 +1495,7 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, const MB_MODE_INFO *mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; - assert(bsize <= cm->seq_params.sb_size || + assert(bsize <= cm->seq_params->sb_size || (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL)); const int bh = mi_size_high[bsize]; @@ -1518,7 +1507,7 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); - write_mbmi_b(cpi, w); + write_mbmi_b(cpi, td, w); for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { const uint8_t palette_size_plane = @@ -1567,10 +1556,10 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, if (!mbmi->skip_txfm) { int start = aom_tell_size(w); - write_tokens_b(cpi, w, tok, tok_end); + write_tokens_b(cpi, &td->mb, w, tok, tok_end); const int end = aom_tell_size(w); - cpi->rc.coefficient_size += end - start; + td->coefficient_size += end - start; } } @@ -1612,12 +1601,12 @@ static AOM_INLINE void write_partition(const AV1_COMMON *const cm, } static AOM_INLINE void write_modes_sb( - AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w, - const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row, - int mi_col, BLOCK_SIZE bsize) { + AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, + aom_writer *const w, const TokenExtra **tok, + const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *const xd = &td->mb.e_mbd; assert(bsize < BLOCK_SIZES_ALL); const int hbs = mi_size_wide[bsize] / 2; const int quarter_step = mi_size_wide[bsize] / 4; @@ -1639,8 +1628,7 @@ 
static AOM_INLINE void write_modes_sb( const int runit_idx = rcol + rrow * rstride; const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx]; - loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, - cpi->td.counts); + loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, td->counts); } } } @@ -1650,51 +1638,53 @@ static AOM_INLINE void write_modes_sb( write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); break; case PARTITION_HORZ: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); if (mi_row + hbs < mi_params->mi_rows) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); break; case PARTITION_VERT: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); if (mi_col + hbs < mi_params->mi_cols) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); break; case PARTITION_SPLIT: - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, subsize); break; case PARTITION_HORZ_A: - write_modes_b(cpi, tile, w, tok, 
tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); break; case PARTITION_HORZ_B: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); break; case PARTITION_VERT_A: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); break; case PARTITION_VERT_B: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); break; case PARTITION_HORZ_4: for (i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= mi_params->mi_rows) break; - write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col); } break; case 
PARTITION_VERT_4: @@ -1702,7 +1692,7 @@ static AOM_INLINE void write_modes_sb( int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= mi_params->mi_cols) break; - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col); } break; default: assert(0); @@ -1712,12 +1702,12 @@ static AOM_INLINE void write_modes_sb( update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } -static AOM_INLINE void write_modes(AV1_COMP *const cpi, +static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, aom_writer *const w, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *const xd = &td->mb.e_mbd; const int mi_row_start = tile->mi_row_start; const int mi_row_end = tile->mi_row_end; const int mi_col_start = tile->mi_col_start; @@ -1735,9 +1725,9 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi, } for (int mi_row = mi_row_start; mi_row < mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { const int sb_row_in_tile = - (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2; + (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2; const TokenExtra *tok = cpi->token_info.tplist[tile_row][tile_col][sb_row_in_tile].start; const TokenExtra *tok_end = @@ -1746,10 +1736,10 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi, av1_zero_left_context(xd); for (int mi_col = mi_col_start; mi_col < mi_col_end; - mi_col += cm->seq_params.mib_size) { - cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); - write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col, - cm->seq_params.sb_size); + mi_col += cm->seq_params->mib_size) { + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col, + 
cm->seq_params->sb_size); } assert(tok == tok_end); } @@ -1758,7 +1748,7 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi, static AOM_INLINE void encode_restoration_mode( AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { assert(!cm->features.all_lossless); - if (!cm->seq_params.enable_restoration) return; + if (!cm->seq_params->enable_restoration) return; if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); int all_none = 1, chroma_none = 1; @@ -1789,9 +1779,9 @@ static AOM_INLINE void encode_restoration_mode( } } if (!all_none) { - assert(cm->seq_params.sb_size == BLOCK_64X64 || - cm->seq_params.sb_size == BLOCK_128X128); - const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64; RestorationInfo *rsi = &cm->rst_info[0]; @@ -1807,7 +1797,8 @@ static AOM_INLINE void encode_restoration_mode( } if (num_planes > 1) { - int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + int s = + AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); if (s && !chroma_none) { aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != cm->rst_info[0].restoration_unit_size); @@ -2040,7 +2031,7 @@ static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { assert(!cm->features.coded_lossless); - if (!cm->seq_params.enable_cdef) return; + if (!cm->seq_params->enable_cdef) return; if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); int i; @@ -2093,7 +2084,7 @@ static AOM_INLINE void encode_quantization( } } -static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, +static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { int i, j; struct segmentation 
*seg = &cm->seg; @@ -2102,17 +2093,9 @@ static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, if (!seg->enabled) return; // Write update flags - if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { - assert(seg->update_map == 1); - seg->temporal_update = 0; - assert(seg->update_data == 1); - } else { + if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) { aom_wb_write_bit(wb, seg->update_map); - if (seg->update_map) { - // Select the coding strategy (temporal or spatial) - av1_choose_segmap_coding_method(cm, xd); - aom_wb_write_bit(wb, seg->temporal_update); - } + if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update); aom_wb_write_bit(wb, seg->update_data); } @@ -2163,11 +2146,11 @@ static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, static AOM_INLINE void write_tile_info_max_tile( const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { int width_mi = - ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params.mib_size_log2); + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); int height_mi = - ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2); - int width_sb = width_mi >> cm->seq_params.mib_size_log2; - int height_sb = height_mi >> cm->seq_params.mib_size_log2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + int width_sb = width_mi >> cm->seq_params->mib_size_log2; + int height_sb = height_mi >> cm->seq_params->mib_size_log2; int size_sb, i; const CommonTileParams *const tiles = &cm->tiles; @@ -2244,13 +2227,6 @@ static AOM_INLINE void write_ext_tile_info( } } -// Stores the location and size of a tile's data in the bitstream. 
Used for -// later identifying identical tiles -typedef struct TileBufferEnc { - uint8_t *data; - size_t size; -} TileBufferEnc; - static INLINE int find_identical_tile( const int tile_row, const int tile_col, TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { @@ -2314,7 +2290,7 @@ static AOM_INLINE void write_render_size(const AV1_COMMON *cm, static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; if (!seq_params->enable_superres) { assert(cm->superres_scale_denominator == SCALE_NUMERATOR); return; @@ -2341,7 +2317,7 @@ static AOM_INLINE void write_frame_size(const AV1_COMMON *cm, const int coded_height = cm->superres_upscaled_height - 1; if (frame_size_override) { - const SequenceHeader *seq_params = &cm->seq_params; + const SequenceHeader *seq_params = cm->seq_params; int num_bits_width = seq_params->num_bits_width; int num_bits_height = seq_params->num_bits_height; aom_wb_write_literal(wb, coded_width, num_bits_width); @@ -2499,7 +2475,7 @@ static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { aom_wb_write_unsigned_literal( wb, cm->frame_presentation_time, - cm->seq_params.decoder_model_info.frame_presentation_time_length); + cm->seq_params->decoder_model_info.frame_presentation_time_length); } static AOM_INLINE void write_film_grain_params( @@ -2537,15 +2513,15 @@ static AOM_INLINE void write_film_grain_params( aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); } - if (!cm->seq_params.monochrome) { + if (!cm->seq_params->monochrome) { aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); } else { assert(!pars->chroma_scaling_from_luma); } - if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || - ((cm->seq_params.subsampling_x == 1) && - (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) { + if 
(cm->seq_params->monochrome || pars->chroma_scaling_from_luma || + ((cm->seq_params->subsampling_x == 1) && + (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) { assert(pars->num_cb_points == 0 && pars->num_cr_points == 0); } else { aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 @@ -2841,12 +2817,11 @@ static int check_frame_refs_short_signaling(AV1_COMMON *const cm) { // New function based on HLS R18 static AOM_INLINE void write_uncompressed_header_obu( - AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb, + AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const CommonQuantParams *quant_params = &cm->quant_params; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; CurrentFrame *const current_frame = &cm->current_frame; FeatureFlags *const features = &cm->features; @@ -2925,7 +2900,7 @@ static AOM_INLINE void write_uncompressed_header_obu( if (cm->superres_upscaled_width > seq_params->max_frame_width || cm->superres_upscaled_height > seq_params->max_frame_height) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Frame dimensions are larger than the maximum values"); } @@ -2947,24 +2922,24 @@ static AOM_INLINE void write_uncompressed_header_obu( } if (seq_params->decoder_model_info_present_flag) { - aom_wb_write_bit(wb, cm->buffer_removal_time_present); - if (cm->buffer_removal_time_present) { + aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present); + if (cpi->ppi->buffer_removal_time_present) { for (int op_num = 0; op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (seq_params->op_params[op_num].decoder_model_param_present_flag) { - if (((seq_params->operating_point_idc[op_num] >> + if (seq_params->operating_point_idc[op_num] == 
0 || + ((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1 && (seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & - 0x1) || - seq_params->operating_point_idc[op_num] == 0) { + 0x1)) { aom_wb_write_unsigned_literal( wb, cm->buffer_removal_times[op_num], seq_params->decoder_model_info.buffer_removal_time_length); cm->buffer_removal_times[op_num]++; if (cm->buffer_removal_times[op_num] == 0) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "buffer_removal_time overflowed"); } } @@ -3051,7 +3026,7 @@ static AOM_INLINE void write_uncompressed_header_obu( 1; if (delta_frame_id_minus_1 < 0 || delta_frame_id_minus_1 >= (1 << diff_len)) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid delta_frame_id_minus_1"); } aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); @@ -3088,8 +3063,8 @@ static AOM_INLINE void write_uncompressed_header_obu( write_tile_info(cm, saved_wb, wb); encode_quantization(quant_params, av1_num_planes(cm), - cm->seq_params.separate_uv_delta_q, wb); - encode_segmentation(cm, xd, wb); + cm->seq_params->separate_uv_delta_q, wb); + encode_segmentation(cm, wb); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0); @@ -3288,11 +3263,11 @@ static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, } uint32_t av1_write_obu_header(AV1LevelParams *const level_params, - OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst) { + int *frame_header_count, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst) { if (level_params->keep_level_stats && (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) - ++level_params->frame_header_count; + ++(*frame_header_count); struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; @@ -3326,8 +3301,8 @@ int 
av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, return AOM_CODEC_OK; } -static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, - uint8_t *data) { +size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); const size_t move_dst_offset = length_field_size + obu_header_size; const size_t move_src_offset = obu_header_size; @@ -3426,12 +3401,12 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, return size; } -static uint32_t write_frame_header_obu(AV1_COMP *cpi, +static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, uint8_t *const dst, int append_trailing_bits) { struct aom_write_bit_buffer wb = { dst, 0 }; - write_uncompressed_header_obu(cpi, saved_wb, &wb); + write_uncompressed_header_obu(cpi, xd, saved_wb, &wb); if (append_trailing_bits) add_trailing_bits(&wb); return aom_wb_bytes_written(&wb); } @@ -3455,12 +3430,6 @@ static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, return size; } -typedef struct { - uint8_t *frame_header; - size_t obu_header_byte_offset; - size_t total_length; -} FrameHeaderInfo; - extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename); @@ -3473,16 +3442,17 @@ typedef struct { static uint32_t init_large_scale_tile_obu_header( AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *lst_obu) { - AV1LevelParams *const level_params = &cpi->level_params; + AV1LevelParams *const level_params = &cpi->ppi->level_params; CurrentFrame *const current_frame = &cpi->common.current_frame; // For large_scale_tile case, we always have only one tile group, so it can // be written as an OBU_FRAME. 
const OBU_TYPE obu_type = OBU_FRAME; - lst_obu->tg_hdr_size = av1_write_obu_header(level_params, obu_type, 0, *data); + lst_obu->tg_hdr_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, obu_type, 0, *data); *data += lst_obu->tg_hdr_size; const uint32_t frame_header_size = - write_frame_header_obu(cpi, saved_wb, *data, 0); + write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0); *data += frame_header_size; lst_obu->frame_header_size = frame_header_size; // (yunqing) This test ensures the correctness of large scale tile coding. @@ -3520,7 +3490,7 @@ static void write_large_scale_tile_obu_size( *total_size += lst_obu->tg_hdr_size; const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size; const size_t length_field_size = - obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); + av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) != AOM_CODEC_OK) assert(0); @@ -3551,6 +3521,7 @@ static void write_large_scale_tile_obu( const int tile_rows = tiles->rows; unsigned int tile_size = 0; + av1_reset_pack_bs_thread_data(&cpi->td); for (int tile_col = 0; tile_col < tile_cols; tile_col++) { TileInfo tile_info; const int is_last_col = (tile_col == tile_cols - 1); @@ -3579,7 +3550,7 @@ static void write_large_scale_tile_obu( mode_bc.allow_update_cdf = mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); - write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); + write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; buf->size = tile_size; @@ -3627,6 +3598,7 @@ static void write_large_scale_tile_obu( *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); } } + av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); } // Packs information in the obu header for large scale tiles. 
@@ -3656,147 +3628,236 @@ static INLINE uint32_t pack_large_scale_tiles_in_tg_obus( return total_size; } +// Writes obu, tile group and uncompressed headers to bitstream. +void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd, + PackBSParams *const pack_bs_params, + const int tile_idx) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size; + const int tg_size = + (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg; + + // Write Tile group, frame and OBU header + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP; + *curr_tg_hdr_size = av1_write_obu_header( + &cpi->ppi->level_params, &cpi->frame_header_count, obu_type, + pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr); + pack_bs_params->obu_header_size = *curr_tg_hdr_size; + + if (cpi->num_tg == 1) + *curr_tg_hdr_size += write_frame_header_obu( + cpi, xd, pack_bs_params->saved_wb, + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0); + *curr_tg_hdr_size += write_tile_group_header( + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1), + (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1); + *pack_bs_params->total_size += *curr_tg_hdr_size; +} + +// Pack tile data in the bitstream with tile_group, frame +// and OBU header. 
+void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td, + PackBSParams *const pack_bs_params) { + aom_writer mode_bc; + AV1_COMMON *const cm = &cpi->common; + int tile_row = pack_bs_params->tile_row; + int tile_col = pack_bs_params->tile_col; + uint32_t *const total_size = pack_bs_params->total_size; + TileInfo tile_info; + av1_tile_set_col(&tile_info, cm, tile_col); + av1_tile_set_row(&tile_info, cm, tile_row); + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + + unsigned int tile_size; + + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&td->mb.e_mbd, num_planes); + + pack_bs_params->buf.data = pack_bs_params->dst + *total_size; + + // The last tile of the tile group does not have a header. + if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4; + + // Pack tile data + aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size); + write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + + pack_bs_params->buf.size = tile_size; + + // Write tile size + if (!pack_bs_params->is_last_tile_in_tg) { + // size of this tile + mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } +} + +void av1_write_last_tile_info( + AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) { + // write current tile group size + const uint32_t obu_payload_size = + (uint32_t)(*curr_tg_data_size) - obu_header_size; + const size_t length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, + 
curr_tg_start) != AOM_CODEC_OK) { + assert(0); + } + *curr_tg_data_size += (int)length_field_size; + *total_size += (uint32_t)length_field_size; + *tile_data_start += length_field_size; + if (cpi->num_tg == 1) { + // if this tg is combined with the frame header then update saved + // frame header base offset according to length field size + saved_wb->bit_buffer += length_field_size; + } + + if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. + memmove(curr_tg_start + fh_info->total_length, curr_tg_start, + *curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length); + + // Force context update tile to be the first tile in error + // resilient mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + *largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count, + OBU_REDUNDANT_FRAME_HEADER, obu_extn_header, + &curr_tg_start[fh_info->obu_header_byte_offset]); + + *curr_tg_data_size += (int)(fh_info->total_length); + *total_size += (uint32_t)(fh_info->total_length); + } + *is_first_tg = 0; +} + +void av1_reset_pack_bs_thread_data(ThreadData *const td) { + td->coefficient_size = 0; + td->max_mv_magnitude = 0; + av1_zero(td->interp_filter_selected); +} + +void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi, + ThreadData const *td) { + int do_max_mv_magnitude_update = 1; + cpi->rc.coefficient_size += td->coefficient_size; + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Disable max_mv_magnitude update for parallel frames based on update flag. 
+ if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0; +#endif + + if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update) + cpi->mv_search_params.max_mv_magnitude = + AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude); + + for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++) + cpi->common.cur_frame->interp_filter_selected[filter] += + td->interp_filter_selected[filter]; +} + // Store information related to each default tile in the OBU header. static void write_tile_obu( AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, - struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id, unsigned int *max_tile_size, uint32_t *const obu_header_size, uint8_t **tile_data_start) { AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const CommonTileParams *const tiles = &cm->tiles; - AV1LevelParams *const level_params = &cpi->level_params; - TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; - unsigned int tile_size = 0; // Fixed size tile groups for the moment const int num_tg_hdrs = cpi->num_tg; const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; int tile_count = 0; - int curr_tg_data_size = 0; - uint8_t *data = dst; + size_t curr_tg_data_size = 0; + uint8_t *tile_data_curr = dst; int new_tg = 1; - int first_tg = 1; + int is_first_tg = 1; + av1_reset_pack_bs_thread_data(&cpi->td); for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { - aom_writer mode_bc; const int tile_idx = tile_row * tile_cols + tile_col; - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - int is_last_tile_in_tg = 0; + int is_last_tile_in_tg 
= 0; if (new_tg) { - data = dst + *total_size; - - // A new tile group begins at this tile. Write the obu header and - // tile group header - const OBU_TYPE obu_type = - (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP; - curr_tg_data_size = av1_write_obu_header(level_params, obu_type, - obu_extension_header, data); - *obu_header_size = curr_tg_data_size; - - if (num_tg_hdrs == 1) - curr_tg_data_size += write_frame_header_obu( - cpi, saved_wb, data + curr_tg_data_size, 0); - curr_tg_data_size += write_tile_group_header( - data + curr_tg_data_size, tile_idx, - AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), - (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1); - *total_size += curr_tg_data_size; - *tile_data_start += curr_tg_data_size; - new_tg = 0; + tile_data_curr = dst + *total_size; tile_count = 0; } tile_count++; - TileInfo tile_info; - av1_tile_set_col(&tile_info, cm, tile_col); - av1_tile_set_row(&tile_info, cm, tile_row); - if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) is_last_tile_in_tg = 1; - new_tg = 1; - } else { - is_last_tile_in_tg = 0; - } - buf->data = dst + *total_size; + xd->tile_ctx = &this_tile->tctx; - // The last tile of the tile group does not have a header. - if (!is_last_tile_in_tg) *total_size += 4; + // PackBSParams stores all parameters required to pack tile and header + // info. 
+ PackBSParams pack_bs_params; + pack_bs_params.dst = dst; + pack_bs_params.curr_tg_hdr_size = 0; + pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params.new_tg = new_tg; + pack_bs_params.obu_extn_header = obu_extn_header; + pack_bs_params.obu_header_size = 0; + pack_bs_params.saved_wb = saved_wb; + pack_bs_params.tile_col = tile_col; + pack_bs_params.tile_row = tile_row; + pack_bs_params.tile_data_curr = tile_data_curr; + pack_bs_params.total_size = total_size; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; - mode_bc.allow_update_cdf = 1; - mode_bc.allow_update_cdf = - mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; - const int num_planes = av1_num_planes(cm); - av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); + if (new_tg) + av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx); - aom_start_encode(&mode_bc, dst + *total_size); - write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; - assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params); - curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 
0 : 4)); - buf->size = tile_size; - if (tile_size > *max_tile_size) { - *largest_tile_id = tile_cols * tile_row + tile_col; - *max_tile_size = tile_size; + if (new_tg) { + curr_tg_data_size = pack_bs_params.curr_tg_hdr_size; + *tile_data_start += pack_bs_params.curr_tg_hdr_size; + *obu_header_size = pack_bs_params.obu_header_size; + new_tg = 0; } + if (is_last_tile_in_tg) new_tg = 1; - if (!is_last_tile_in_tg) { - // size of this tile - mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES); - } else { - // write current tile group size - const uint32_t obu_payload_size = curr_tg_data_size - *obu_header_size; - const size_t length_field_size = - obu_memmove(*obu_header_size, obu_payload_size, data); - if (av1_write_uleb_obu_size(*obu_header_size, obu_payload_size, data) != - AOM_CODEC_OK) { - assert(0); - } - curr_tg_data_size += (int)length_field_size; - *total_size += (uint32_t)length_field_size; - *tile_data_start += length_field_size; - if (num_tg_hdrs == 1) { - // if this tg is combined with the frame header then update saved - // frame header base offset accroding to length field size - saved_wb->bit_buffer += length_field_size; - } + curr_tg_data_size += + (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4)); - if (!first_tg && cm->features.error_resilient_mode) { - // Make room for a duplicate Frame Header OBU. - memmove(data + fh_info->total_length, data, curr_tg_data_size); - - // Insert a copy of the Frame Header OBU. - memcpy(data, fh_info->frame_header, fh_info->total_length); - - // Force context update tile to be the first tile in error - // resiliant mode as the duplicate frame headers will have - // context_update_tile_id set to 0 - *largest_tile_id = 0; - - // Rewrite the OBU header to change the OBU type to Redundant Frame - // Header. 
- av1_write_obu_header(level_params, OBU_REDUNDANT_FRAME_HEADER, - obu_extension_header, - &data[fh_info->obu_header_byte_offset]); - - data += fh_info->total_length; - - curr_tg_data_size += (int)(fh_info->total_length); - *total_size += (uint32_t)(fh_info->total_length); - } - first_tg = 0; + if (pack_bs_params.buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params.buf.size; } - *total_size += tile_size; + if (is_last_tile_in_tg) + av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size, + tile_data_curr, total_size, tile_data_start, + largest_tile_id, &is_first_tg, + *obu_header_size, obu_extn_header); + *total_size += (uint32_t)pack_bs_params.buf.size; } } + av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); } // Write total buffer size and related information into the OBU header for @@ -3854,6 +3915,24 @@ static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst, } } +// As per the experiments, single-thread bitstream packing is better for +// frames with a smaller bitstream size. This behavior is due to setup time +// overhead of multithread function would be more than that of time required +// to pack the smaller bitstream of such frames. We set a threshold on the +// total absolute sum of transform coeffs to detect such frames and disable +// Multithreading. 
+int enable_pack_bitstream_mt(const TileDataEnc *tile_data, int num_tiles, + int num_workers) { + if (AOMMIN(num_workers, num_tiles) <= 1) return 0; + + const int num_work_sqr = num_workers * num_workers; + const uint64_t thresh = 50; + uint64_t frame_abs_sum_level = 0; + for (int idx = 0; idx < num_tiles; idx++) + frame_abs_sum_level += tile_data[idx].abs_sum_level; + return ((frame_abs_sum_level > (num_work_sqr * thresh) / (num_workers - 1))); +} + static INLINE uint32_t pack_tiles_in_tg_obus( AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, @@ -3863,16 +3942,25 @@ static INLINE uint32_t pack_tiles_in_tg_obus( unsigned int max_tile_size = 0; uint32_t obu_header_size = 0; uint8_t *tile_data_start = dst; - - write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, fh_info, - largest_tile_id, &max_tile_size, &obu_header_size, - &tile_data_start); - + const int num_workers = cpi->mt_info.num_mod_workers[MOD_PACK_BS]; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; - const int have_tiles = tile_cols * tile_rows > 1; + const int num_tiles = tile_rows * tile_cols; + + const int enable_mt = + enable_pack_bitstream_mt(cpi->tile_data, num_tiles, num_workers); - if (have_tiles) + if (enable_mt) { + av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, + &obu_header_size, &tile_data_start); + } else { + write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, &obu_header_size, + &tile_data_start); + } + + if (num_tiles > 1) write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size, max_tile_size, obu_header_size, tile_data_start); return total_size; @@ -3887,6 +3975,9 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const CommonTileParams *const tiles = &cm->tiles; *largest_tile_id = 0; + // Select the coding 
strategy (temporal or spatial) + if (cm->seg.enabled) av1_choose_segmap_coding_method(cm, &cpi->td.mb.e_mbd); + if (tiles->large_scale) return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb, largest_tile_id); @@ -3926,18 +4017,20 @@ static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { (cm->current_frame.frame_type != KEY_FRAME && current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { - obu_header_size = - av1_write_obu_header(&cpi->level_params, OBU_METADATA, 0, dst); + obu_header_size = av1_write_obu_header(&cpi->ppi->level_params, + &cpi->frame_header_count, + OBU_METADATA, 0, dst); obu_payload_size = av1_write_metadata_obu(current_metadata, dst + obu_header_size); - length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst); + length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) == AOM_CODEC_OK) { const size_t obu_size = obu_header_size + obu_payload_size; dst += obu_size + length_field_size; total_bytes_written += obu_size + length_field_size; } else { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Error writing metadata OBU size"); } } @@ -3951,7 +4044,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, uint8_t *data = dst; uint32_t data_size; AV1_COMMON *const cm = &cpi->common; - AV1LevelParams *const level_params = &cpi->level_params; + AV1LevelParams *const level_params = &cpi->ppi->level_params; uint32_t obu_header_size = 0; uint32_t obu_payload_size = 0; FrameHeaderInfo fh_info = { NULL, 0, 0 }; @@ -3967,19 +4060,19 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, bitstream_queue_reset_write(); #endif - level_params->frame_header_count = 0; + cpi->frame_header_count = 0; // The TD is now written outside the frame encode loop // write sequence 
header obu if KEY_FRAME, preceded by 4-byte size if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { - obu_header_size = - av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data); + obu_header_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data); obu_payload_size = - av1_write_sequence_header_obu(&cm->seq_params, data + obu_header_size); + av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size); const size_t length_field_size = - obu_memmove(obu_header_size, obu_payload_size, data); + av1_obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; @@ -3998,12 +4091,13 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, if (write_frame_header) { // Write Frame Header OBU. fh_info.frame_header = data; - obu_header_size = av1_write_obu_header(level_params, OBU_FRAME_HEADER, - obu_extension_header, data); - obu_payload_size = - write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1); + obu_header_size = + av1_write_obu_header(level_params, &cpi->frame_header_count, + OBU_FRAME_HEADER, obu_extension_header, data); + obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, + data + obu_header_size, 1); - length_field = obu_memmove(obu_header_size, obu_payload_size, data); + length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; diff --git a/third_party/libaom/source/libaom/av1/encoder/bitstream.h b/third_party/libaom/source/libaom/av1/encoder/bitstream.h index df35ecccfa..e32cd3bd19 100644 --- a/third_party/libaom/source/libaom/av1/encoder/bitstream.h +++ b/third_party/libaom/source/libaom/av1/encoder/bitstream.h @@ -16,9 +16,67 @@ extern "C" { #endif -#include "av1/encoder/encoder.h" +#include 
"av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" +#include "av1/encoder/level.h" +#include "aom_dsp/bitwriter.h" struct aom_write_bit_buffer; +struct AV1_COMP; +struct ThreadData; + +/*!\cond */ + +// Stores the location and size of a tile's data in the bitstream. Used for +// later identifying identical tiles +typedef struct { + uint8_t *data; + size_t size; +} TileBufferEnc; + +typedef struct { + uint8_t *frame_header; + size_t obu_header_byte_offset; + size_t total_length; +} FrameHeaderInfo; + +typedef struct { + struct aom_write_bit_buffer *saved_wb; // Bit stream buffer writer structure + TileBufferEnc buf; // Structure to hold bitstream buffer and size + uint32_t *total_size; // Size of the bitstream buffer for the tile in bytes + uint8_t *dst; // Base address of tile bitstream buffer + uint8_t *tile_data_curr; // Base address of tile-group bitstream buffer + size_t tile_buf_size; // Available bitstream buffer for the tile in bytes + uint8_t obu_extn_header; // Presence of OBU extension header + uint32_t obu_header_size; // Size of the OBU header + int curr_tg_hdr_size; // Size of the obu, tg, frame headers + int tile_size_mi; // Tile size in mi units + int tile_row; // Number of tile rows + int tile_col; // Number of tile columns + int is_last_tile_in_tg; // Flag to indicate last tile in a tile-group + int new_tg; // Flag to indicate starting of a new tile-group +} PackBSParams; + +typedef struct { + uint64_t abs_sum_level; + uint16_t tile_idx; +} PackBSTileOrder; + +// Pack bitstream data for pack bitstream multi-threading. +typedef struct { +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. + pthread_mutex_t *mutex_; +#endif + // Tile order structure of pack bitstream multithreading. + PackBSTileOrder pack_bs_tile_order[MAX_TILES]; + + // Index of next job to be processed. 
+ int next_job_idx; +} AV1EncPackBSSync; + +/*!\endcond */ // Writes only the OBU Sequence Header payload, and returns the size of the // payload written to 'dst'. This function does not write the OBU header, the @@ -29,23 +87,44 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, // Writes the OBU header byte, and the OBU header extension byte when // 'obu_extension' is non-zero. Returns number of bytes written to 'dst'. uint32_t av1_write_obu_header(AV1LevelParams *const level_params, - OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst); + int *frame_header_count, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst); int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, uint8_t *dest); +// Pack tile data in the bitstream with tile_group, frame +// and OBU header. +void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td, + PackBSParams *const pack_bs_params); + +void av1_write_last_tile_info( + struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header); + /*!\brief Pack the bitstream for one frame * * \ingroup high_level_algo * \callgraph */ -int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, +int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size, int *const largest_tile_id); void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w); +void av1_reset_pack_bs_thread_data(struct ThreadData *const td); + +void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi, + struct ThreadData const *td); + +void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi, + MACROBLOCKD *const xd, + PackBSParams *const 
pack_bs_params, + const int tile_idx); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/block.h b/third_party/libaom/source/libaom/av1/encoder/block.h index 59353cfac3..aaf3654a5f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/block.h +++ b/third_party/libaom/source/libaom/av1/encoder/block.h @@ -102,7 +102,7 @@ typedef struct { */ typedef struct macroblock_plane { //! Stores source - pred so the txfm can be computed later - DECLARE_ALIGNED(32, int16_t, src_diff[MAX_SB_SQUARE]); + int16_t *src_diff; //! Dequantized coefficients tran_low_t *dqcoeff; //! Quantized coefficients @@ -778,6 +778,23 @@ typedef struct { /**@}*/ } MvCosts; +/*! \brief Holds mv costs for intrabc. + */ +typedef struct { + /*! Costs for coding the joint mv. */ + int joint_mv[MV_JOINTS]; + + /*! \brief Cost of transmitting the actual motion vector. + * dv_costs_alloc[0][i] is the cost of motion vector with horizontal + * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost of + * motion vector with vertical component (mv_col) equal to i - MV_MAX. + */ + int dv_costs_alloc[2][MV_VALS]; + + /*! Points to the middle of \ref dv_costs_alloc. */ + int *dv_costs[2]; +} IntraBCMVCosts; + /*! \brief Holds the costs needed to encode the coefficients */ typedef struct { @@ -817,6 +834,14 @@ typedef struct { int lighting_change; int low_sumdiff; } CONTENT_STATE_SB; + +// Structure to hold pixel level gradient info. +typedef struct { + uint16_t abs_dx_abs_dy_sum; + int8_t hist_bin_idx; + bool is_dx_zero; +} PixelLevelGradientInfo; + /*!\endcond */ /*! \brief Encoder's parameters related to the current coding block. @@ -945,6 +970,11 @@ typedef struct macroblock { //! multipliers for motion search. MvCosts *mv_costs; + /*! The rate needed to encode a new motion vector to the bitstream in intrabc + * mode. + */ + IntraBCMVCosts *dv_costs; + //! The rate needed to signal the txfm coefficients to the bitstream. 
CoeffCosts coeff_costs; /**@}*/ @@ -1014,6 +1044,10 @@ typedef struct macroblock { int pred_mv_sad[REF_FRAMES]; //! The minimum of \ref pred_mv_sad. int best_pred_mv_sad; + //! The sad of the 1st mv ref (nearest). + int pred_mv0_sad[REF_FRAMES]; + //! The sad of the 2nd mv ref (near). + int pred_mv1_sad[REF_FRAMES]; /*! \brief Disables certain ref frame pruning based on tpl. * @@ -1092,8 +1126,7 @@ typedef struct macroblock { * In the second pass, we retry the winner modes with more thorough txfm * options. */ - WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA, - MAX_WINNER_MODE_COUNT_INTER)]; + WinnerModeStats *winner_mode_stats; //! Tracks how many winner modes there are. int winner_mode_count; @@ -1147,10 +1180,20 @@ typedef struct macroblock { */ IntraBCHashInfo intrabc_hash_info; - /*! \brief Whether to reuse the mode stored in intermode_cache. */ - int use_intermode_cache; - /*! \brief The mode to reuse during \ref av1_rd_pick_inter_mode. */ - const MB_MODE_INFO *intermode_cache; + /*! \brief Whether to reuse the mode stored in mb_mode_cache. */ + int use_mb_mode_cache; + /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and + * \ref av1_rd_pick_inter_mode. */ + const MB_MODE_INFO *mb_mode_cache; + /*! \brief Pointer to the buffer which caches gradient information. + * + * Pointer to the array of structures to store gradient information of each + * pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level + * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). + */ + PixelLevelGradientInfo *pixel_gradient_info; + /*! \brief Flags indicating the availability of cached gradient info. */ + bool is_sb_gradient_cached[PLANE_TYPES]; /**@}*/ /***************************************************************************** @@ -1195,6 +1238,8 @@ typedef struct macroblock { * Used in REALTIME coding mode to enhance the visual quality at the boundary * of moving color objects. 
*/ + uint8_t color_sensitivity_sb[2]; + //! Color sensitivity flag for the coding block. uint8_t color_sensitivity[2]; /**@}*/ diff --git a/third_party/libaom/source/libaom/av1/encoder/compound_type.c b/third_party/libaom/source/libaom/av1/encoder/compound_type.c index aacb7fc88a..00fa3890bf 100644 --- a/third_party/libaom/source/libaom/av1/encoder/compound_type.c +++ b/third_party/libaom/source/libaom/av1/encoder/compound_type.c @@ -48,31 +48,31 @@ static INLINE int is_comp_rd_match(const AV1_COMP *const cpi, if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0; } - // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD - for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; - comp_type++) { - comp_rate[comp_type] = st->rate[comp_type]; - comp_dist[comp_type] = st->dist[comp_type]; - comp_model_rate[comp_type] = st->model_rate[comp_type]; - comp_model_dist[comp_type] = st->model_dist[comp_type]; - comp_rs2[comp_type] = st->comp_rs2[comp_type]; - } - - // For compound wedge/segment, reuse data only if NEWMV is not present in - // either of the directions + int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 }; + // For compound wedge, reuse data if newmv search is disabled when NEWMV is + // present or if NEWMV is not present in either of the directions if ((!have_newmv_in_inter_mode(mi->mode) && !have_newmv_in_inter_mode(st->mode)) || - (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) { - memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE], - sizeof(comp_rate[COMPOUND_WEDGE]) * 2); - memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE], - sizeof(comp_dist[COMPOUND_WEDGE]) * 2); - memcpy(&comp_model_rate[COMPOUND_WEDGE], &st->model_rate[COMPOUND_WEDGE], - sizeof(comp_model_rate[COMPOUND_WEDGE]) * 2); - memcpy(&comp_model_dist[COMPOUND_WEDGE], &st->model_dist[COMPOUND_WEDGE], - sizeof(comp_model_dist[COMPOUND_WEDGE]) * 2); - memcpy(&comp_rs2[COMPOUND_WEDGE], &st->comp_rs2[COMPOUND_WEDGE], - 
sizeof(comp_rs2[COMPOUND_WEDGE]) * 2); + (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) + reuse_data[COMPOUND_WEDGE] = 1; + // For compound diffwtd, reuse data if fast search is enabled (no newmv search + // when NEWMV is present) or if NEWMV is not present in either of the + // directions + if (cpi->sf.inter_sf.enable_fast_compound_mode_search || + (!have_newmv_in_inter_mode(mi->mode) && + !have_newmv_in_inter_mode(st->mode))) + reuse_data[COMPOUND_DIFFWTD] = 1; + + // Store the stats for the different compound types + for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES; + comp_type++) { + if (reuse_data[comp_type]) { + comp_rate[comp_type] = st->rate[comp_type]; + comp_dist[comp_type] = st->dist[comp_type]; + comp_model_rate[comp_type] = st->model_rate[comp_type]; + comp_model_dist[comp_type] = st->model_dist[comp_type]; + comp_rs2[comp_type] = st->comp_rs2[comp_type]; + } } return 1; } @@ -166,14 +166,14 @@ static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants) // for all codebooks; experiment with other quadrant combinations for // 0, 90 and 135 degrees also. 
- cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); - cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, - pred0 + bh_by2 * stride0 + bw_by2, stride0, - &esq[0][1]); - cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); - cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, - pred1 + bh_by2 * stride1 + bw_by2, stride0, - &esq[1][1]); + cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); + cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, + pred0 + bh_by2 * stride0 + bw_by2, stride0, + &esq[0][1]); + cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); + cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, + pred1 + bh_by2 * stride1 + bw_by2, stride0, + &esq[1][1]); tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]); br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]); @@ -314,7 +314,7 @@ static int64_t pick_interinter_wedge( int8_t wedge_sign = 0; assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); - assert(cpi->common.seq_params.enable_masked_compound); + assert(cpi->common.seq_params->enable_masked_compound); if (cpi->sf.inter_sf.fast_wedge_sign_estimate) { wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); @@ -392,7 +392,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, const MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(av1_is_wedge_used(bsize)); - assert(cpi->common.seq_params.enable_interintra_compound); + assert(cpi->common.seq_params->enable_interintra_compound); const struct buf_2d *const src = &x->plane[0].src; const int bw = block_size_wide[bsize]; @@ -836,7 +836,7 @@ static INLINE int compute_valid_comp_types(MACROBLOCK *x, const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); const int try_distwtd_comp = ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && - 
cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 && + cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 && cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases @@ -1058,10 +1058,12 @@ static int64_t masked_compound_type_rd( if (compound_type == COMPOUND_WEDGE) { unsigned int sse; if (is_cur_buf_hbd(xd)) - (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides, - CONVERT_TO_BYTEPTR(*preds1), *strides, &sse); + (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides, + CONVERT_TO_BYTEPTR(*preds1), *strides, + &sse); else - (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse); + (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, + &sse); const unsigned int mse = ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]); // If two predictors are very similar, skip wedge compound mode search @@ -1164,7 +1166,8 @@ static int64_t masked_compound_type_rd( assert(comp_dist[compound_type] != INT64_MAX); // When disable_interinter_wedge_newmv_search is set, motion refinement is // disabled. 
Hence rate and distortion can be reused in this case as well - assert(IMPLIES(have_newmv_in_inter_mode(this_mode), + assert(IMPLIES((have_newmv_in_inter_mode(this_mode) && + (compound_type == COMPOUND_WEDGE)), cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)); assert(mbmi->mv[0].as_int == cur_mv[0].as_int); assert(mbmi->mv[1].as_int == cur_mv[1].as_int); @@ -1338,11 +1341,12 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, if (have_newmv_in_inter_mode(this_mode)) { InterPredParams inter_pred_params; av1_dist_wtd_comp_weight_assign( - &cpi->common, mbmi, 0, &inter_pred_params.conv_params.fwd_offset, + &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset, &inter_pred_params.conv_params.bck_offset, &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1); int mask_value = inter_pred_params.conv_params.fwd_offset * 4; - memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask)); + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } @@ -1369,7 +1373,7 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] }; int best_rs2 = 0; int best_rate_mv = *rate_mv; - const int wedge_mask_size = get_wedge_types_lookup(bsize); + int wedge_mask_size = get_wedge_types_lookup(bsize); int need_mask_search = args->wedge_index == -1; if (need_mask_search && !have_newmv_in_inter_mode(this_mode)) { @@ -1392,7 +1396,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); if (mode_rd >= ref_best_rd / 2) continue; - if (have_newmv_in_inter_mode(this_mode)) { + if (have_newmv_in_inter_mode(this_mode) && + !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search) { tmp_rate_mv = av1_interinter_compound_motion_search( cpi, x, cur_mv, bsize, this_mode); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, @@ 
-1425,6 +1430,33 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, best_rs2 = rs2; } } + // Consider the asymmetric partitions for oblique angle only if the + // corresponding symmetric partition is the best so far. + // Note: For horizontal and vertical types, both symmetric and + // asymmetric partitions are always considered. + if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) { + // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16] + // correspond to symmetric partitions of the 4 oblique angles, the + // next 4 entries correspond to the vertical/horizontal + // symmetric/asymmetric partitions and the last 8 entries correspond + // to the asymmetric partitions of oblique types. + const int idx_before_asym_oblique = 7; + const int last_oblique_sym_idx = 3; + if (wedge_mask == idx_before_asym_oblique) { + if (best_mask_index > last_oblique_sym_idx) { + break; + } else { + // Asymmetric (Index-1) map for the corresponding oblique masks. + // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9 + // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13 + // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15 + // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11 + const int asym_mask_idx[4] = { 7, 11, 13, 9 }; + wedge_mask = asym_mask_idx[best_mask_index]; + wedge_mask_size = wedge_mask + 3; + } + } + } } if (need_mask_search) { @@ -1439,7 +1471,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, rs2 = masked_type_cost[cur_type]; rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); - if (have_newmv_in_inter_mode(this_mode)) { + if (have_newmv_in_inter_mode(this_mode) && + !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search) { tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } @@ -1485,7 +1518,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, if (have_newmv_in_inter_mode(this_mode)) { // hard coded number for diff wtd int mask_value = mask_index == 0 ? 
38 : 26; - memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask)); + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } @@ -1522,7 +1556,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); int mask_value = mbmi->interinter_comp.mask_type == 0 ? 38 : 26; - memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask)); + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); if (have_newmv_in_inter_mode(this_mode)) { tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, diff --git a/third_party/libaom/source/libaom/av1/encoder/context_tree.c b/third_party/libaom/source/libaom/av1/encoder/context_tree.c index 566576e4f5..9fd9d1b1e8 100644 --- a/third_party/libaom/source/libaom/av1/encoder/context_tree.c +++ b/third_party/libaom/source/libaom/av1/encoder/context_tree.c @@ -230,7 +230,7 @@ static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128, void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { AV1_COMMON *const cm = &cpi->common; const int stat_generation_stage = is_stat_generation_stage(cpi); - const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128; + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; const int tree_nodes = get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); int sms_tree_index = 0; diff --git a/third_party/libaom/source/libaom/av1/encoder/dwt.c b/third_party/libaom/source/libaom/av1/encoder/dwt.c index b5ed4a3446..5dfbcb677b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/dwt.c +++ b/third_party/libaom/source/libaom/av1/encoder/dwt.c @@ -147,9 +147,23 @@ uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) { return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh)); } -int av1_haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, 
int hbd) { +static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, + int hbd) { tran_low_t output[64]; av1_fdwt8x8_uint8_input_c(input, output, stride, hbd); return av1_haar_ac_sad(output, 8, 8, 8); } + +int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, + int hbd, int num_8x8_rows, + int num_8x8_cols) { + int64_t wavelet_energy = 0; + for (int r8 = 0; r8 < num_8x8_rows; ++r8) { + for (int c8 = 0; c8 < num_8x8_cols; ++c8) { + wavelet_energy += haar_ac_sad_8x8_uint8_input( + input + c8 * 8 + r8 * 8 * stride, stride, hbd); + } + } + return wavelet_energy; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/dwt.h b/third_party/libaom/source/libaom/av1/encoder/dwt.h index 1bd32edb3b..443b6bc12c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/dwt.h +++ b/third_party/libaom/source/libaom/av1/encoder/dwt.h @@ -19,6 +19,9 @@ void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output, int stride, int hbd); -int av1_haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, int hbd); + +int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, + int hbd, int num_8x8_rows, + int num_8x8_cols); #endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/enc_enums.h b/third_party/libaom/source/libaom/av1/encoder/enc_enums.h index 319e5d02c9..20cefa16a5 100644 --- a/third_party/libaom/source/libaom/av1/encoder/enc_enums.h +++ b/third_party/libaom/source/libaom/av1/encoder/enc_enums.h @@ -216,6 +216,8 @@ enum { NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START, THR_MODE_START = THR_NEARESTMV, THR_MODE_END = MAX_MODES, + THR_INTER_MODE_START = THR_MODE_START, + THR_INTER_MODE_END = THR_DC, THR_INVALID = 255 } UENUM1BYTE(THR_MODES); diff --git a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c index da7ec4487d..01f2959d85 100644 --- 
a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c +++ b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c @@ -106,11 +106,19 @@ void av1_configure_buffer_updates( } if (ext_refresh_frame_flags->update_pending && - (!is_stat_generation_stage(cpi))) + (!is_stat_generation_stage(cpi))) { set_refresh_frame_flags(refresh_frame_flags, ext_refresh_frame_flags->golden_frame, ext_refresh_frame_flags->bwd_ref_frame, ext_refresh_frame_flags->alt_ref_frame); + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (ext_refresh_frame_flags->golden_frame) + gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE; + if (ext_refresh_frame_flags->alt_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE; + if (ext_refresh_frame_flags->bwd_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE; + } if (force_refresh_all) set_refresh_frame_flags(refresh_frame_flags, true, true, true); @@ -141,7 +149,7 @@ static INLINE int is_frame_droppable( const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) { // Droppable frame is only used by external refresh flags. VoD setting won't // trigger its use case. - if (svc->external_ref_frame_config) + if (svc->set_ref_frame_config) return svc->non_reference_frame; else if (ext_refresh_frame_flags->update_pending) return !(ext_refresh_frame_flags->alt_ref_frame || @@ -168,7 +176,7 @@ static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { static INLINE void update_gf_group_index(AV1_COMP *cpi) { // Increment the gf group index ready for the next frame. - ++cpi->gf_group.index; + ++cpi->gf_frame_index; } static void update_rc_counts(AV1_COMP *cpi) { @@ -216,7 +224,7 @@ static int get_current_frame_ref_type( // TODO(jingning): This table should be a lot simpler with the new // ARF system in place. Keep frame_params for the time being as we are // still evaluating a few design options. 
- switch (cpi->gf_group.layer_depth[cpi->gf_group.index]) { + switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) { case 0: return 0; case 1: return 1; case MAX_ARF_LAYERS: @@ -238,16 +246,16 @@ static int choose_primary_ref_frame( // In large scale case, always use Last frame's frame contexts. // Note(yunqing): In other cases, primary_ref_frame is chosen based on - // cpi->gf_group.layer_depth[cpi->gf_group.index], which also controls + // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls // frame bit allocation. if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME); - if (cpi->use_svc) return av1_svc_primary_ref_frame(cpi); + if (cpi->ppi->use_svc) return av1_svc_primary_ref_frame(cpi); // Find the most recent reference frame with the same reference type as the // current frame const int current_ref_type = get_current_frame_ref_type(cpi, frame_params); - int wanted_fb = cpi->fb_of_context_type[current_ref_type]; + int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type]; int primary_ref_frame = PRIMARY_REF_NONE; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { @@ -303,7 +311,7 @@ static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) { // Clear down mmx registers aom_clear_system_state(); - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + if (cpi->ppi->use_svc && cpi->svc.spatial_layer_id > 0) { cpi->framerate = cpi->svc.base_framerate; av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); return; @@ -372,17 +380,17 @@ static struct lookahead_entry *choose_frame_source( struct lookahead_entry **last_source, EncodeFrameParams *const frame_params) { AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; struct lookahead_entry *source = NULL; // Source index in lookahead buffer. 
- int src_index = gf_group->arf_src_offset[gf_group->index]; + int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q if (src_index && (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index, cpi->compressor_stage) != -1) && - cpi->oxcf.rc_cfg.mode != AOM_Q) { + cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) { src_index = 0; *flush = 1; } @@ -395,7 +403,7 @@ static struct lookahead_entry *choose_frame_source( // If this is a key frame and keyframe filtering is enabled with overlay, // then do not pop. if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 && - gf_group->update_type[gf_group->index] == ARF_UPDATE && + gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) { if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz && (*flush || @@ -404,16 +412,37 @@ static struct lookahead_entry *choose_frame_source( *pop_lookahead = 0; } } + + // LAP stage does not have ARFs or forward key-frames, + // hence, always pop_lookahead here. + if (is_stat_generation_stage(cpi)) { + *pop_lookahead = 1; + src_index = 0; + } + frame_params->show_frame = *pop_lookahead; - if (*pop_lookahead) { + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Future frame in parallel encode set + if (gf_group->src_offset[cpi->gf_frame_index] != 0 && + !is_stat_generation_stage(cpi) && + 0 /*will be turned on along with frame parallel encode*/) { + src_index = gf_group->src_offset[cpi->gf_frame_index]; + // Don't remove future frames from lookahead_ctx. They will be + // removed in their actual encode call. + *pop_lookahead = 0; + } +#endif + if (frame_params->show_frame) { // show frame, pop from buffer // Get last frame source. 
if (cm->current_frame.frame_number > 0) { - *last_source = - av1_lookahead_peek(cpi->ppi->lookahead, -1, cpi->compressor_stage); + *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1, + cpi->compressor_stage); } // Read in the source frame. - source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage); + source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, + cpi->compressor_stage); } else { // no show frames are arf frames source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, @@ -677,7 +706,17 @@ void av1_update_ref_frame_map(AV1_COMP *cpi, return; } -static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) { +static int get_free_ref_map_index( +#if CONFIG_FRAME_PARALLEL_ENCODE + RefFrameMapPair ref_map_pairs[REF_FRAMES], +#endif // CONFIG_FRAME_PARALLEL_ENCODE + const RefBufferStack *ref_buffer_stack) { +#if CONFIG_FRAME_PARALLEL_ENCODE + (void)ref_buffer_stack; + for (int idx = 0; idx < REF_FRAMES; ++idx) + if (ref_map_pairs[idx].disp_order == -1) return idx; + return INVALID_IDX; +#else for (int idx = 0; idx < REF_FRAMES; ++idx) { int is_free = 1; for (int i = 0; i < ref_buffer_stack->arf_stack_size; ++i) { @@ -704,11 +743,61 @@ static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) { if (is_free) return idx; } return INVALID_IDX; +#endif // CONFIG_FRAME_PARALLEL_ENCODE } +#if CONFIG_FRAME_PARALLEL_ENCODE +static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int update_arf, int cur_frame_disp) { + int arf_count = 0; + int oldest_arf_order = INT32_MAX; + int oldest_arf_idx = -1; + + int oldest_frame_order = INT32_MAX; + int oldest_idx = -1; + + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + const int reference_frame_level = ref_pair.pyr_level; + // Do not refresh a future frame. 
+ if (frame_order > cur_frame_disp) continue; + + // Keep track of the oldest level 1 frame if the current frame is also level + // 1. + if (reference_frame_level == 1) { + // If there are more than 2 level 1 frames in the reference list, + // discard the oldest. + if (frame_order < oldest_arf_order) { + oldest_arf_order = frame_order; + oldest_arf_idx = map_idx; + } + arf_count++; + continue; + } + + // Update the overall oldest reference frame. + if (frame_order < oldest_frame_order) { + oldest_frame_order = frame_order; + oldest_idx = map_idx; + } + } + if (update_arf && arf_count > 2) return oldest_arf_idx; + if (oldest_idx >= 0) return oldest_idx; + if (oldest_arf_idx >= 0) return oldest_arf_idx; + assert(0 && "No valid refresh index found"); + return -1; +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, FRAME_UPDATE_TYPE frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + int cur_disp_order, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], +#endif // CONFIG_FRAME_PARALLEL_ENCODE const RefBufferStack *const ref_buffer_stack) { const AV1_COMMON *const cm = &cpi->common; const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = @@ -733,7 +822,7 @@ int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, int refresh_mask = 0; if (ext_refresh_frame_flags->update_pending) { - if (svc->external_ref_frame_config) { + if (svc->set_ref_frame_config) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { int ref_frame_map_idx = svc->ref_idx[i]; refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx; @@ -777,7 +866,30 @@ int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, } // Search for the open slot to store the current frame. 
- int free_fb_index = get_free_ref_map_index(ref_buffer_stack); + int free_fb_index = get_free_ref_map_index( +#if CONFIG_FRAME_PARALLEL_ENCODE + ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + ref_buffer_stack); + +#if CONFIG_FRAME_PARALLEL_ENCODE + // No refresh necessary for these frame types. + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) + return refresh_mask; + + // If there is an open slot, refresh that one instead of replacing a + // reference. + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + return refresh_mask; + } + + const int update_arf = frame_update_type == ARF_UPDATE; + const int refresh_idx = + get_refresh_idx(ref_frame_map_pairs, update_arf, cur_disp_order); + return 1 << refresh_idx; +#else switch (frame_update_type) { case KF_UPDATE: case GF_UPDATE: @@ -843,6 +955,7 @@ int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, } return refresh_mask; +#endif // CONFIG_FRAME_PARALLEL_ENCODE } #if !CONFIG_REALTIME_ONLY @@ -852,10 +965,10 @@ void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; - av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size); + av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size); - av1_setup_block_planes(xd, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); set_mi_offsets(&cm->mi_params, xd, 0, 0); } @@ -872,8 +985,9 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, #endif const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_UPDATE_TYPE update_type = + 
get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); // Decide whether to apply temporal filtering to the source frame. int apply_filtering = 0; @@ -887,7 +1001,7 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1; if (allow_kf_filtering) { const double y_noise_level = av1_estimate_noise_from_single_plane( - frame_input->source, 0, cm->seq_params.bit_depth); + frame_input->source, 0, cm->seq_params->bit_depth); apply_filtering = y_noise_level > 0; } else { apply_filtering = 0; @@ -900,6 +1014,9 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, // ARF apply_filtering = oxcf->algo_cfg.arnr_max_frames > 0; } + if (is_stat_generation_stage(cpi)) { + apply_filtering = 0; + } #if CONFIG_COLLECT_COMPONENT_TIMING if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time); @@ -911,7 +1028,7 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, int show_existing_alt_ref = 0; // TODO(bohanli): figure out why we need frame_type in cm here. cm->current_frame.frame_type = frame_params->frame_type; - int arf_src_index = gf_group->arf_src_offset[gf_group->index]; + int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; int is_forward_keyframe = 0; if (!frame_params->show_frame && cpi->no_show_fwd_kf) { // TODO(angiebird): Figure out why this condition yields forward keyframe. 
@@ -922,8 +1039,8 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, av1_temporal_filter(cpi, arf_src_index, update_type, is_forward_keyframe, &show_existing_alt_ref); if (code_arf) { - aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm)); - frame_input->source = &cpi->alt_ref_buffer; + aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer, av1_num_planes(cm)); + frame_input->source = &cpi->ppi->alt_ref_buffer; aom_copy_metadata_to_frame_buffer(frame_input->source, source_buffer->metadata); } @@ -944,12 +1061,12 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, // Don't do tpl for fwd key frames or fwd key frame overlays allow_tpl = allow_tpl && !cpi->sf.tpl_sf.disable_filtered_key_tpl && !cpi->no_show_fwd_kf && - gf_group->update_type[gf_group->index] != OVERLAY_UPDATE; + gf_group->update_type[cpi->gf_frame_index] != OVERLAY_UPDATE; } else { // Do tpl after ARF is filtered, or if no ARF, at the second frame of GF // group. // TODO(bohanli): if no ARF, just do it at the first frame. - int gf_index = gf_group->index; + int gf_index = cpi->gf_frame_index; allow_tpl = allow_tpl && (gf_group->update_type[gf_index] == ARF_UPDATE || gf_group->update_type[gf_index] == GF_UPDATE); if (allow_tpl) { @@ -962,10 +1079,13 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, if (allow_tpl == 0) { // Avoid the use of unintended TPL stats from previous GOP's results. 
- if (gf_group->index == 0) av1_init_tpl_stats(&cpi->tpl_data); + if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) + av1_init_tpl_stats(&cpi->ppi->tpl_data); } else { - if (!cpi->tpl_data.skip_tpl_setup_stats) + if (!cpi->skip_tpl_setup_stats) { + av1_tpl_preload_rc_estimate(cpi, frame_params); av1_tpl_setup_stats(cpi, 0, frame_params, frame_input); + } } if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) != @@ -1003,12 +1123,262 @@ static INLINE int find_unused_ref_frame(const int *used_ref_frames, return INVALID_IDX; } -void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) { +#if CONFIG_FRAME_PARALLEL_ENCODE +/*!\cond */ +// Struct to keep track of relevant reference frame data. +typedef struct { + int map_idx; + int disp_order; + int pyr_level; + int used; +} RefBufMapData; +/*!\endcond */ + +// Comparison function to sort reference frames in ascending display order. +static int compare_map_idx_pair_asc(const void *a, const void *b) { + if (((RefBufMapData *)a)->disp_order == ((RefBufMapData *)b)->disp_order) { + return 0; + } else if (((const RefBufMapData *)a)->disp_order > + ((const RefBufMapData *)b)->disp_order) { + return 1; + } else { + return -1; + } +} + +// Checks to see if a particular reference frame is already in the reference +// frame map. +static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) { + for (int i = 0; i < n_frames; i++) { + if (disp_order == map[i].disp_order) return 1; + } + return 0; +} + +// Add a reference buffer index to a named reference slot. +static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx, + int frame) { + remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx; + ref->used = 1; +} + +// Threshold dictating when we are allowed to start considering +// leaving lowest level frames unmapped. +#define LOW_LEVEL_FRAMES_TR 5 + +// Find which reference buffer should be left out of the named mapping. 
+// This is because there are 8 reference buffers and only 7 named slots. +static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs, + int n_min_level_refs, int min_level, + int cur_frame_disp) { + int max_dist = 0; + int unmapped_idx = -1; + if (n_bufs <= ALTREF_FRAME) return; + for (int i = 0; i < n_bufs; i++) { + if (buffer_map[i].used) continue; + if (buffer_map[i].pyr_level != min_level || + n_min_level_refs >= LOW_LEVEL_FRAMES_TR) { + int dist = abs(cur_frame_disp - buffer_map[i].disp_order); + if (dist > max_dist) { + max_dist = dist; + unmapped_idx = i; + } + } + } + assert(unmapped_idx >= 0 && "Unmapped reference not found"); + buffer_map[unmapped_idx].used = 1; +} + +static void get_ref_frames(AV1_COMP *const cpi, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp) { AV1_COMMON *cm = &cpi->common; int *const remapped_ref_idx = cm->remapped_ref_idx; - int *const arf_stack = ref_buffer_stack->arf_stack; - int *const lst_stack = ref_buffer_stack->lst_stack; - int *const gld_stack = ref_buffer_stack->gld_stack; + + int buf_map_idx = 0; + + // Initialize reference frame mappings. + for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX; + + RefBufMapData buffer_map[REF_FRAMES]; + int n_bufs = 0; + memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0])); + int min_level = MAX_ARF_LAYERS; + int max_level = 0; + + // Go through current reference buffers and store display order, pyr level, + // and map index. + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + // Get reference frame buffer. + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + // Avoid duplicates. + if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue; + const int reference_frame_level = ref_pair.pyr_level; + + // Keep track of the lowest and highest levels that currently exist. 
+ if (reference_frame_level < min_level) min_level = reference_frame_level; + if (reference_frame_level > max_level) max_level = reference_frame_level; + + buffer_map[n_bufs].map_idx = map_idx; + buffer_map[n_bufs].disp_order = frame_order; + buffer_map[n_bufs].pyr_level = reference_frame_level; + buffer_map[n_bufs].used = 0; + n_bufs++; + } + + // Sort frames in ascending display order. + qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc); + + int n_min_level_refs = 0; + int n_past_high_level = 0; + int closest_past_ref = -1; + int golden_idx = -1; + int altref_idx = -1; + + // Find the GOLDEN_FRAME and BWDREF_FRAME. + // Also collect various stats about the reference frames for the remaining + // mappings. + for (int i = n_bufs - 1; i >= 0; i--) { + if (buffer_map[i].pyr_level == min_level) { + // Keep track of the number of lowest level frames. + n_min_level_refs++; + if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 && + remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) { + // Save index for GOLDEN. + golden_idx = i; + } else if (buffer_map[i].disp_order > cur_frame_disp && + altref_idx == -1 && + remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) { + // Save index for ALTREF. + altref_idx = i; + } + } else if (buffer_map[i].disp_order == cur_frame_disp) { + // Map the BWDREF_FRAME if this is the show_existing_frame. + add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME); + } + + // Keep track of the number of past frames that are not at the lowest level. + if (buffer_map[i].disp_order < cur_frame_disp && + buffer_map[i].pyr_level != min_level) + n_past_high_level++; + + // Keep track of where the frames change from being past frames to future + // frames. + if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0) + closest_past_ref = i; + } + + // Do not map GOLDEN and ALTREF based on their pyramid level if all reference + // frames have the same level. 
+ if (n_min_level_refs <= n_bufs) { + // Map the GOLDEN_FRAME. + if (golden_idx > -1) + add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME); + // Map the ALTREF_FRAME. + if (altref_idx > -1) + add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME); + } + + // Find the buffer to be excluded from the mapping. + set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level, + cur_frame_disp); + + // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME. + for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer + // in decreasing ouptut order relative to current picture. + int next_buf_max = 0; + int next_disp_order = INT_MIN; + for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used && + buffer_map[buf_map_idx].disp_order < cur_frame_disp && + buffer_map[buf_map_idx].disp_order > next_disp_order) { + next_disp_order = buffer_map[buf_map_idx].disp_order; + next_buf_max = buf_map_idx; + } + } + buf_map_idx = next_buf_max; + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME. + for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer + // in increasing ouptut order relative to current picture. 
+ int next_buf_max = 0; + int next_disp_order = INT_MAX; + for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used && + buffer_map[buf_map_idx].disp_order > cur_frame_disp && + buffer_map[buf_map_idx].disp_order < next_disp_order) { + next_disp_order = buffer_map[buf_map_idx].disp_order; + next_buf_max = buf_map_idx; + } + } + buf_map_idx = next_buf_max; + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place remaining past frames. + buf_map_idx = closest_past_ref; + for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer. + for (; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used) break; + } + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place remaining future frames. + buf_map_idx = n_bufs - 1; + for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer. + for (; buf_map_idx > closest_past_ref; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used) break; + } + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Fill any slots that are empty (should only happen for the first 7 frames). 
+ for (int i = 0; i < REF_FRAMES; ++i) + if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0; +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +void av1_get_ref_frames(const RefBufferStack *ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + AV1_COMP *cpi, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int remapped_ref_idx[REF_FRAMES]) { +#if CONFIG_FRAME_PARALLEL_ENCODE + (void)ref_buffer_stack; + (void)remapped_ref_idx; + get_ref_frames(cpi, ref_frame_map_pairs, cur_frame_disp); + return; +#else + const int *const arf_stack = ref_buffer_stack->arf_stack; + const int *const lst_stack = ref_buffer_stack->lst_stack; + const int *const gld_stack = ref_buffer_stack->gld_stack; const int arf_stack_size = ref_buffer_stack->arf_stack_size; const int lst_stack_size = ref_buffer_stack->lst_stack_size; const int gld_stack_size = ref_buffer_stack->gld_stack_size; @@ -1079,6 +1449,7 @@ void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) { remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0]; } } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, @@ -1088,7 +1459,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, int flush) { AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; ExternalFlags *const ext_flags = &cpi->ext_flags; GFConfig *const gf_cfg = &oxcf->gf_cfg; @@ -1112,9 +1483,9 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) { #if !CONFIG_REALTIME_ONLY - if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + if (flush && oxcf->pass == 1 && !cpi->ppi->twopass.first_pass_done) { av1_end_first_pass(cpi); /* get last stats packet */ - cpi->twopass.first_pass_done = 1; + 
cpi->ppi->twopass.first_pass_done = 1; } #endif return -1; @@ -1128,11 +1499,9 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height); } - cpi->tpl_data.skip_tpl_setup_stats = 0; + cpi->skip_tpl_setup_stats = 0; #if !CONFIG_REALTIME_ONLY - const int use_one_pass_rt_params = has_no_stats_stage(cpi) && - oxcf->mode == REALTIME && - gf_cfg->lag_in_frames == 0; + const int use_one_pass_rt_params = is_one_pass_rt_params(cpi); if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_get_second_pass_params_time); @@ -1148,19 +1517,19 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, // If this is a forward keyframe, mark as a show_existing_frame // TODO(bohanli): find a consistent condition for fwd keyframes if (oxcf->kf_cfg.fwd_kf_enabled && - gf_group->update_type[gf_group->index] == OVERLAY_UPDATE && + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE && cpi->rc.frames_to_key == 0) { frame_params.show_existing_frame = 1; } else { frame_params.show_existing_frame = (cpi->show_existing_alt_ref && - gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) || - gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) || + gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE; } frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); // Reset show_existing_alt_ref decision to 0 after it is used. - if (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) { + if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) { cpi->show_existing_alt_ref = 0; } } else { @@ -1181,13 +1550,20 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, if (source == NULL) { // If no source was found, we can't encode a frame. 
#if !CONFIG_REALTIME_ONLY - if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + if (flush && oxcf->pass == 1 && !cpi->ppi->twopass.first_pass_done) { av1_end_first_pass(cpi); /* get last stats packet */ - cpi->twopass.first_pass_done = 1; + cpi->ppi->twopass.first_pass_done = 1; } #endif return -1; } + +#if CONFIG_FRAME_PARALLEL_ENCODE + // reset src_offset to allow actual encode call for this frame to get its + // source. + gf_group->src_offset[cpi->gf_frame_index] = 0; +#endif + // Source may be changed if temporal filtered later. frame_input.source = &source->img; frame_input.last_source = last_source != NULL ? &last_source->img : NULL; @@ -1216,7 +1592,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, &cm->film_grain_params); } else { cm->cur_frame->film_grain_params_present = - cm->seq_params.film_grain_params_present; + cm->seq_params->film_grain_params_present; } // only one operating point supported now const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp); @@ -1226,19 +1602,20 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, #if CONFIG_REALTIME_ONLY av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); - if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 && - cm->number_temporal_layers == 1) - av1_set_reference_structure_one_pass_rt(cpi, gf_group->index == 0); + if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 && + cpi->ppi->number_temporal_layers == 1) + av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0); #else if (use_one_pass_rt_params) { av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); - if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 && - cm->number_temporal_layers == 1) - av1_set_reference_structure_one_pass_rt(cpi, gf_group->index == 0); + if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 && + cpi->ppi->number_temporal_layers == 1) + av1_set_reference_structure_one_pass_rt(cpi, 
cpi->gf_frame_index == 0); } #endif - FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group); + FRAME_UPDATE_TYPE frame_update_type = + get_frame_update_type(gf_group, cpi->gf_frame_index); if (frame_params.show_existing_frame && frame_params.frame_type != KEY_FRAME) { @@ -1302,9 +1679,21 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME]; const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME]; +#if CONFIG_FRAME_PARALLEL_ENCODE + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); + const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + const int cur_frame_disp = + cpi->common.current_frame.frame_number + order_offset; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + if (!ext_flags->refresh_frame.update_pending) { - av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); - } else if (cpi->svc.external_ref_frame_config) { + av1_get_ref_frames(&cpi->ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, cur_frame_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); + } else if (cpi->svc.set_ref_frame_config) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i]; } @@ -1319,19 +1708,54 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, frame_params.ref_frame_flags = get_ref_frame_flags( &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE. 
+ if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) { + frame_params.primary_ref_frame = PRIMARY_REF_NONE; + } else { + frame_params.primary_ref_frame = + choose_primary_ref_frame(cpi, &frame_params); + } +#else frame_params.primary_ref_frame = choose_primary_ref_frame(cpi, &frame_params); - frame_params.order_offset = gf_group->arf_src_offset[gf_group->index]; - - frame_params.refresh_frame_flags = av1_get_refresh_frame_flags( - cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack); - +#endif // CONFIG_FRAME_PARALLEL_ENCODE + + frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + + frame_params.refresh_frame_flags = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + cur_frame_disp, ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + &cpi->ref_buffer_stack); + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Make the frames marked as is_frame_non_ref to non-reference frames. + if (gf_group->is_frame_non_ref[cpi->gf_frame_index]) + frame_params.refresh_frame_flags = 0; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +#if CONFIG_FRAME_PARALLEL_ENCODE + frame_params.existing_fb_idx_to_show = INVALID_IDX; + // Find the frame buffer to show based on display order. + if (frame_params.show_existing_frame) { + for (int frame = 0; frame < REF_FRAMES; frame++) { + const RefCntBuffer *const buf = cm->ref_frame_map[frame]; + if (buf == NULL) continue; + const int frame_order = (int)buf->display_order_hint; + if (frame_order == cur_frame_disp) + frame_params.existing_fb_idx_to_show = frame; + } + } +#else frame_params.existing_fb_idx_to_show = frame_params.show_existing_frame ? (frame_update_type == INTNL_OVERLAY_UPDATE ? get_ref_frame_map_idx(cm, BWDREF_FRAME) : get_ref_frame_map_idx(cm, ALTREF_FRAME)) : INVALID_IDX; +#endif // CONFIG_FRAME_PARALLEL_ENCODE } // The way frame_params->remapped_ref_idx is setup is a placeholder. 
@@ -1351,6 +1775,12 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm; } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Copy previous frame's largest MV component from ppi to cpi. + if (!is_stat_generation_stage(cpi) && cpi->do_frame_data_update) + cpi->mv_search_params.max_mv_magnitude = cpi->ppi->max_mv_magnitude; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + #if CONFIG_REALTIME_ONLY if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != AOM_CODEC_OK) { @@ -1369,10 +1799,17 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, } #endif // CONFIG_REALTIME_ONLY +#if CONFIG_FRAME_PARALLEL_ENCODE + // Store current frame's largest MV component in ppi. + if (!is_stat_generation_stage(cpi) && cpi->do_frame_data_update) + cpi->ppi->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude; +#endif + if (!is_stat_generation_stage(cpi)) { // First pass doesn't modify reference buffer assignment or produce frame // flags update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags); +#if !CONFIG_FRAME_PARALLEL_ENCODE if (!ext_flags->refresh_frame.update_pending) { int ref_map_index = av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags); @@ -1380,6 +1817,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, cm->show_existing_frame, ref_map_index, &cpi->ref_buffer_stack); } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } #if !CONFIG_REALTIME_ONLY @@ -1408,7 +1846,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, } if (!is_stat_generation_stage(cpi)) { - update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type); + update_fb_of_context_type(cpi, &frame_params, cpi->ppi->fb_of_context_type); set_additional_frame_flags(cm, frame_flags); update_rc_counts(cpi); } @@ -1421,7 +1859,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, cpi->droppable = is_frame_droppable(&cpi->svc, &ext_flags->refresh_frame); } - 
if (cpi->use_svc) av1_save_layer_context(cpi); + if (cpi->ppi->use_svc) av1_save_layer_context(cpi); return AOM_CODEC_OK; } diff --git a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h index 351e8a1328..c7b75c8430 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h +++ b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h @@ -69,6 +69,10 @@ void av1_configure_buffer_updates( int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, FRAME_UPDATE_TYPE frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + int cur_disp_order, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], +#endif // CONFIG_FRAME_PARALLEL_ENCODE const RefBufferStack *const ref_buffer_stack); int av1_get_refresh_ref_frame_map(int refresh_frame_flags); @@ -79,7 +83,25 @@ void av1_update_ref_frame_map(AV1_COMP *cpi, int ref_map_index, RefBufferStack *ref_buffer_stack); -void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack); +/*!\brief Obtain indices of reference frames from reference frame buffer stacks + * + * \callgraph + * \callergraph + * + * \param[in] ref_buffer_stack Data structure for reference frame buffer + * stacks. + * \param[out] remapped_ref_idx An array for storing indices of reference + * frames. The index is used to retrieve a + * reference frame buffer from ref_frame_map + * in AV1Common. 
+ */ +void av1_get_ref_frames(const RefBufferStack *ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + AV1_COMP *cpi, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int remapped_ref_idx[REF_FRAMES]); int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, const int up_to_index, diff --git a/third_party/libaom/source/libaom/av1/encoder/encodeframe.c b/third_party/libaom/source/libaom/av1/encoder/encodeframe.c index 24d3488245..b3f836b481 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodeframe.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodeframe.c @@ -55,6 +55,7 @@ #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" +#include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/ml.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_strategy.h" @@ -150,7 +151,7 @@ unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = - cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse); + cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } @@ -163,9 +164,9 @@ unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8, AV1_HIGH_VAR_OFFS_10, AV1_HIGH_VAR_OFFS_12 }; - var = - cpi->fn_ptr[bs].vf(ref->buf, ref->stride, - CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, &sse); + var = cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, + &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } @@ -181,7 +182,8 @@ static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, assert(last != NULL); last_y = &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; - var = 
cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); + var = cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, + &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } @@ -242,7 +244,7 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, const DeltaQInfo *const delta_q_info = &cm->delta_q_info; assert(delta_q_info->delta_q_present_flag); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; // Delta-q modulation based on variance av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); @@ -307,7 +309,7 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; - const int mib_size = cm->seq_params.mib_size; + const int mib_size = cm->seq_params->mib_size; // pre-set the delta lf for loop filter. 
Note that this value is set // before mi is assigned for each block in current superblock @@ -326,22 +328,23 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCK *x = &td->mb; - const int frame_idx = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const int frame_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; av1_zero(x->tpl_keep_ref_frame); - if (tpl_frame->is_valid == 0) return; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return; if (frame_idx >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + if (!tpl_frame->is_valid) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; - const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; + const int is_overlay = + cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; if (is_overlay) { memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame)); return; @@ -351,7 +354,7 @@ static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, const int tpl_stride = tpl_frame->stride; int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 }; const int step = 1 << block_mis_log2; - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int mi_row_end = AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows); @@ -426,15 +429,15 @@ static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, static 
AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col) { - const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; const int orig_rdmult = cpi->rd.RDMULT; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int gf_group_index = cpi->gf_group.index; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int gf_group_index = cpi->gf_frame_index; if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ && cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 && - cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { + cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { const int dr = av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult); x->rdmult = dr; @@ -451,7 +454,7 @@ static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile, MACROBLOCKD *xd = &x->e_mbd; // TODO(kyslov) Extend to 128x128 - assert(cm->seq_params.sb_size == BLOCK_64X64); + assert(cm->seq_params->sb_size == BLOCK_64X64); av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); @@ -512,7 +515,7 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info = &tile_data->tile_info; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; // Grade the temporal variation of the sb, the grade will be used to decide // fast mode search strategy for coding blocks @@ -557,6 +560,20 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, sf->part_sf.partition_search_type == VAR_BASED_PARTITION); set_cb_offsets(td->mb.cb_offset, 0, 0); + // Initialize the flag to skip cdef for 64x64 blocks: if color sensitivy is + // on, set to 0 (don't 
skip). + if (sf->rt_sf.skip_cdef_sb) { + const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1; + for (int r = 0; r < block64_in_sb; ++r) { + for (int c = 0; c < block64_in_sb; ++c) { + const int idx_in_sb = + r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64; + if (mi[idx_in_sb]) + mi[idx_in_sb]->skip_cdef_curr_sb = + !(x->color_sensitivity_sb[0] || x->color_sensitivity_sb[1]); + } + } + } // Adjust and encode the superblock PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size); av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, @@ -599,7 +616,7 @@ static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, if (gather_tpl_data) { if (cm->delta_q_info.delta_q_present_flag) { const int num_planes = av1_num_planes(cm); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes); av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col); } @@ -637,7 +654,7 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info = &tile_data->tile_info; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int num_planes = av1_num_planes(cm); int dummy_rate; int64_t dummy_dist; @@ -708,10 +725,17 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 
2 : 1; if (num_passes == 1) { +#if CONFIG_PARTITION_SEARCH_ORDER + av1_reset_part_sf(&cpi->sf.part_sf); + RD_STATS this_rdc; + av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row, mi_col, + sb_size, &this_rdc); +#else PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size); av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL, SB_SINGLE_PASS, NULL); +#endif // CONFIG_PARTITION_SEARCH_ORDER } else { // First pass SB_FIRST_PASS_STATS sb_fp_stats; @@ -753,7 +777,8 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, static AOM_INLINE int is_rtc_mode(const CostUpdateFreq *cost_upd_freq, int use_non_rd_mode) { return (use_non_rd_mode && cost_upd_freq->coeff >= 2 && - cost_upd_freq->mode >= 2 && cost_upd_freq->mv >= 2); + cost_upd_freq->mode >= 2 && cost_upd_freq->mv >= 2 && + cost_upd_freq->dv >= 2); } /*!\brief Encode a superblock row by breaking it into superblocks @@ -776,9 +801,9 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; - const int mib_size = cm->seq_params.mib_size; - const int mib_size_log2 = cm->seq_params.mib_size_log2; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int mib_size = cm->seq_params->mib_size; + const int mib_size_log2 = cm->seq_params->mib_size_log2; const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; const CostUpdateFreq *const cost_upd_freq = &cpi->oxcf.cost_upd_freq; @@ -833,6 +858,8 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col); // Reset color coding related parameters + x->color_sensitivity_sb[0] = 0; + x->color_sensitivity_sb[1] = 0; 
x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; x->content_state_sb.source_sad = kMedSad; @@ -855,6 +882,12 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } + // Produce the gradient data at superblock level, when intra mode pruning + // based on hog is enabled. + if (cpi->sf.intra_sf.intra_pruning_with_hog || + cpi->sf.intra_sf.chroma_intra_pruning_with_hog) + produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col); + // encode the superblock if (use_nonrd_mode) { encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); @@ -886,10 +919,10 @@ static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) { // Copy data over into macro block data structures. av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, - cm->seq_params.sb_size); + cm->seq_params->sb_size); - av1_setup_block_planes(xd, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); } void av1_alloc_tile_data(AV1_COMP *cpi) { @@ -927,13 +960,14 @@ void av1_init_tile_data(AV1_COMP *cpi) { TileInfo *const tile_info = &tile_data->tile_info; av1_tile_init(tile_info, cm, tile_row, tile_col); tile_data->firstpass_top_mv = kZeroMv; + tile_data->abs_sum_level = 0; if (pre_tok != NULL && tplist != NULL) { token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = token_info->tile_tok[tile_row][tile_col]; - tile_tok = allocated_tokens(*tile_info, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, - num_planes); + tile_tok = allocated_tokens( + *tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, + num_planes); token_info->tplist[tile_row][tile_col] = tplist + tplist_count; tplist = token_info->tplist[tile_row][tile_col]; tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info); @@ -961,14 +995,14 @@ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, 
int tile_row, TokenExtra *tok = NULL; TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col]; const int sb_row_in_tile = - (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2; + (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; const int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; const int num_mb_rows_in_sb = - ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; get_start_tok(cpi, tile_row, tile_col, mi_row, &tok, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes); assert(tplist != NULL); tplist[sb_row_in_tile].start = tok; @@ -979,7 +1013,7 @@ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <= get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes)); (void)tile_mb_cols; @@ -1005,7 +1039,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, &td->mb.e_mbd); if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra) - cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); if (td->mb.txfm_search_info.txb_rd_records != NULL) { av1_crc32c_calculator_init( @@ -1013,9 +1047,10 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, } for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } + this_tile->abs_sum_level = td->abs_sum_level; } /*!\brief Break one frame into tiles and encode the tiles @@ -1030,15 +1065,13 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { const int tile_rows = cm->tiles.rows; int tile_col, tile_row; + MACROBLOCK *const mb = 
&cpi->td.mb; assert(IMPLIES(cpi->tile_data == NULL, cpi->allocated_tiles < tile_cols * tile_rows)); if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); av1_init_tile_data(cpi); - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - cpi->td.mb.txfm_search_info.txb_rd_records = - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords)); - } + av1_alloc_mb_data(cm, mb, cpi->sf.rt_sf.use_nonrd_pick_mode); for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { @@ -1046,6 +1079,7 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; cpi->td.intrabc_used = 0; cpi->td.deltaq_used = 0; + cpi->td.abs_sum_level = 0; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; cpi->td.mb.tile_pb_ctx = &this_tile->tctx; // Reset cyclic refresh counters. @@ -1062,10 +1096,7 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { } } - if (cpi->td.mb.txfm_search_info.txb_rd_records) { - aom_free(cpi->td.mb.txfm_search_info.txb_rd_records); - cpi->td.mb.txfm_search_info.txb_rd_records = NULL; - } + av1_dealloc_mb_data(cm, mb); } // Set the relative distance of a reference frame w.r.t. current frame @@ -1141,10 +1172,10 @@ static int check_skip_mode_enabled(AV1_COMP *const cpi) { const int cur_offset = (int)cm->current_frame.order_hint; int ref_offset[2]; get_skip_mode_ref_offsets(cm, ref_offset); - const int cur_to_ref0 = get_relative_dist(&cm->seq_params.order_hint_info, + const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info, cur_offset, ref_offset[0]); - const int cur_to_ref1 = abs(get_relative_dist(&cm->seq_params.order_hint_info, - cur_offset, ref_offset[1])); + const int cur_to_ref1 = abs(get_relative_dist( + &cm->seq_params->order_hint_info, cur_offset, ref_offset[1])); if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; // High Latency: Turn off skip mode if all refs are fwd. 
@@ -1248,6 +1279,9 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; RD_COUNTS *const rdc = &cpi->td.rd_counts; FrameProbInfo *const frame_probs = &cpi->frame_probs; +#if CONFIG_FRAME_PARALLEL_ENCODE + FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; +#endif // CONFIG_FRAME_PARALLEL_ENCODE IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; @@ -1278,9 +1312,15 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { if (features->allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - if (frame_probs->warped_probs[update_type] < - cpi->sf.inter_sf.prune_warped_prob_thresh) + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int warped_probability; +#if CONFIG_FRAME_PARALLEL_ENCODE + warped_probability = temp_frame_probs->warped_probs[update_type]; +#else + warped_probability = frame_probs->warped_probs[update_type]; +#endif + if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh) features->allow_warped_motion = 0; } @@ -1316,7 +1356,7 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { // Hash data generated for screen contents is used for intraBC ME const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize]; const int max_sb_size = - (1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)); + (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)); int src_idx = 0; for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) { const int dst_idx = !src_idx; @@ -1377,10 +1417,10 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { // is used for ineligible frames. That effectively will turn off row_mt // usage. 
Note objective delta_q and tpl eligible frames are only altref // frames currently. - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; if (cm->delta_q_info.delta_q_present_flag) { if (deltaq_mode == DELTA_Q_OBJECTIVE && - !is_frame_tpl_eligible(gf_group, gf_group->index)) + !is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) cm->delta_q_info.delta_q_present_flag = 0; } @@ -1500,8 +1540,8 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { features->tx_mode = select_tx_mode(cm, tx_search_type); if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < TX_SIZES_ALL; i++) { int sum = 0; int j; @@ -1519,13 +1559,33 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { left -= prob; if (j == 0) prob += left; frame_probs->tx_type_probs[update_type][i][j] = prob; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT implementation + * to carry these values to subsequent frames. The frame_probs update is + * accumulated across frames, so the values from all individual parallel + * frames need to be taken into account after all the parallel frames + * are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->tx_type_probs[update_type_idx][i][j] = + frame_probs->tx_type_probs[update_type_idx][i][j]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } } } if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < BLOCK_SIZES_ALL; i++) { int sum = 0; @@ -1535,23 +1595,63 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0; frame_probs->obmc_probs[update_type][i] = (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT + * implementation to carry these values to subsequent frames. + * The frame_probs update is accumulated across frames, so the + * values from all individual parallel frames need to be taken + * into account after all the parallel frames are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->obmc_probs[update_type_idx][i] = + frame_probs->obmc_probs[update_type_idx][i]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } } if (features->allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); int sum = 0; for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i]; const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0; frame_probs->warped_probs[update_type] = (frame_probs->warped_probs[update_type] + new_prob) >> 1; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT + * implementation to carry these values to subsequent frames. + * The frame_probs update is accumulated across frames, so the + * values from all individual parallel frames need to be taken + * into account after all the parallel frames are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->warped_probs[update_type_idx] = + frame_probs->warped_probs[update_type_idx]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } if (cm->current_frame.frame_type != KEY_FRAME && cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && features->interp_filter == SWITCHABLE) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int sum = 0; @@ -1572,6 +1672,25 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { left -= prob; if (j == 0) prob += left; frame_probs->switchable_interp_probs[update_type][i][j] = prob; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT + * implementation to carry these values to subsequent frames. + * The frame_probs update is accumulated across frames, so the + * values from all individual parallel frames need to be taken + * into account after all the parallel frames are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = + frame_probs->switchable_interp_probs[update_type_idx][i][j]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } } } diff --git a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c index c10b2ffe6c..d3fa50292b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c @@ -44,7 +44,6 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM); - aom_clear_system_state(); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col / num_mi_h; @@ -59,20 +58,19 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); *rdmult = AOMMAX(*rdmult, 0); av1_set_error_per_bit(errorperbit, *rdmult); - aom_clear_system_state(); } // Return the end column for the current superblock, in unit of TPL blocks. static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col, int num_mi_w) { // Find the start column of this superblock. - const int sb_mi_col_start = (mi_col >> cm->seq_params.mib_size_log2) - << cm->seq_params.mib_size_log2; + const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2) + << cm->seq_params->mib_size_log2; // Same but in superres upscaled dimension. const int sb_mi_col_start_sr = coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator); // Width of this superblock in mi units. - const int sb_mi_width = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size]; // Same but in superres upscaled dimension. 
const int sb_mi_width_sr = coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator); @@ -86,15 +84,16 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, int orig_rdmult) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; const int deltaq_rdmult = set_deltaq_rdmult(cpi, x); - if (tpl_frame->is_valid == 0) return deltaq_rdmult; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return deltaq_rdmult; if (tpl_idx >= MAX_TPL_FRAME_IDX) return deltaq_rdmult; + const TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx]; + if (!tpl_frame->is_valid) return deltaq_rdmult; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) + return deltaq_rdmult; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; const int mi_col_sr = @@ -116,7 +115,6 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, int row, col; double base_block_count = 0.0; double geom_mean_of_scale = 0.0; - aom_clear_system_state(); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; @@ -124,7 +122,7 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, col < sb_bcol_end; ++col) { const int index = row * num_cols + col; - geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]); + geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]); base_block_count += 1.0; } } @@ -132,8 +130,7 @@ int 
av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5); rdmult = AOMMAX(rdmult, 0); av1_set_error_per_bit(&x->errorperbit, rdmult); - aom_clear_system_state(); - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { const int rdmult_sb = set_deltaq_rdmult(cpi, x); assert(rdmult_sb == rdmult); (void)rdmult_sb; @@ -341,7 +338,7 @@ void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col); const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row); - if (cm->seq_params.order_hint_info.enable_ref_frame_mvs) + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); } @@ -604,9 +601,9 @@ static void set_partial_sb_partition(const AV1_COMMON *const cm, MB_MODE_INFO **mib) { int bh = bh_in; int r, c; - for (r = 0; r < cm->seq_params.mib_size; r += bh) { + for (r = 0; r < cm->seq_params->mib_size; r += bh) { int bw = bw_in; - for (c = 0; c < cm->seq_params.mib_size; c += bw) { + for (c = 0; c < cm->seq_params->mib_size; c += bw) { const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c); const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c); mib[grid_index] = mi + mi_index; @@ -638,11 +635,11 @@ void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); // Apply the requested partition size to the SB if it is all "in image" - if ((mi_cols_remaining >= cm->seq_params.mib_size) && - (mi_rows_remaining >= cm->seq_params.mib_size)) { - for (int block_row = 0; block_row < cm->seq_params.mib_size; + if ((mi_cols_remaining >= cm->seq_params->mib_size) && + (mi_rows_remaining >= cm->seq_params->mib_size)) { + for (int block_row = 0; block_row < cm->seq_params->mib_size; block_row += bh) { - for (int block_col = 0; block_col < cm->seq_params.mib_size; + for (int 
block_col = 0; block_col < cm->seq_params->mib_size; block_col += bw) { const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col); const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col); @@ -682,25 +679,25 @@ int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult) { AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; - int tpl_stride = tpl_frame->stride; int64_t intra_cost = 0; int64_t mc_dep_cost = 0; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; - if (tpl_frame->is_valid == 0) return orig_rdmult; + if (tpl_idx >= MAX_TPL_FRAME_IDX) return orig_rdmult; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return orig_rdmult; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return orig_rdmult; - if (cpi->gf_group.index >= MAX_TPL_FRAME_IDX) return orig_rdmult; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return orig_rdmult; int mi_count = 0; const int mi_col_sr = @@ -727,8 +724,6 @@ int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, } assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); - 
aom_clear_system_state(); - double beta = 1.0; if (mc_dep_cost > 0 && intra_cost > 0) { const double r0 = cpi->rd.r0; @@ -738,8 +733,6 @@ int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int rdmult = av1_get_adaptive_rdmult(cpi, beta); - aom_clear_system_state(); - rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2); rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2); @@ -760,7 +753,7 @@ int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { if (is_stat_consumption_stage_twopass(cpi)) { const AV1_COMMON *const cm = &cpi->common; const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( - &cpi->twopass, cm->current_frame.display_order_hint); + &cpi->ppi->twopass, cm->current_frame.display_order_hint); if (this_frame_stats == NULL) return AOM_CODEC_ERROR; // The inactive region is specified in MBs not mi units. @@ -790,7 +783,7 @@ int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { if (is_stat_consumption_stage_twopass(cpi)) { const AV1_COMMON *const cm = &cpi->common; const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( - &cpi->twopass, cm->current_frame.display_order_hint); + &cpi->ppi->twopass, cm->current_frame.display_order_hint); if (this_frame_stats == NULL) return AOM_CODEC_ERROR; // The inactive region is specified in MBs not mi units. 
@@ -814,24 +807,26 @@ void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, if (!cpi->oxcf.algo_cfg.enable_tpl_model) return; if (cpi->common.current_frame.frame_type == KEY_FRAME) return; - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE) return; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); AV1_COMMON *const cm = &cpi->common; - const int gf_group_index = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; - int tpl_stride = tpl_frame->stride; + const int gf_group_index = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; - if (tpl_frame->is_valid == 0) return; if (gf_group_index >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return; + int mi_count = 0; int count = 0; const int mi_col_sr = @@ -889,26 +884,26 @@ void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; - TplDepStats 
*tpl_stats = tpl_frame->tpl_stats_ptr; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; - int tpl_stride = tpl_frame->stride; int64_t intra_cost = 0; int64_t mc_dep_cost = 0; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; const int base_qindex = cm->quant_params.base_qindex; - if (tpl_frame->is_valid == 0) return base_qindex; + if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return base_qindex; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return base_qindex; - if (cpi->gf_group.index >= MAX_TPL_FRAME_IDX) return base_qindex; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return base_qindex; int mi_count = 0; const int mi_col_sr = @@ -935,8 +930,6 @@ int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, } assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); - aom_clear_system_state(); - int offset = 0; double beta = 1.0; if (mc_dep_cost > 0 && intra_cost > 0) { @@ -945,8 +938,7 @@ int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, beta = (r0 / rk); assert(beta > 0.0); } - offset = av1_get_deltaq_offset(cpi, base_qindex, beta); - aom_clear_system_state(); + offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); @@ -1164,7 +1156,7 @@ void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int offset) { 
unsigned int tmp_sse; unsigned int tmp_variance; - const BLOCK_SIZE bsize = cpi->common.seq_params.sb_size; + const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; uint8_t *src_y = cpi->source->y_buffer; int src_ystride = cpi->source->y_stride; uint8_t *last_src_y = cpi->last_source->y_buffer; @@ -1178,8 +1170,8 @@ void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int offset) { #endif src_y += offset; last_src_y += offset; - tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, - last_src_ystride, &tmp_sse); + tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); if (tmp_sse < avg_source_sse_threshold) x->content_state_sb.source_sad = kLowSad; else if (tmp_sse > avg_source_sse_threshold_high) @@ -1233,7 +1225,7 @@ void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; @@ -1269,7 +1261,7 @@ void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); @@ -1294,33 +1286,32 @@ void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, #endif // CONFIG_INTERNAL_STATS } -// Checks for skip status of mv cost update. -static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, - const int mi_row, const int mi_col) { - // For intra frames, mv cdfs are not updated during the encode. Hence, the mv - // cost calculation is skipped in this case. 
- if (frame_is_intra_only(&cpi->common)) return 1; - // mv_cost_upd_level=0: update happens at each sb, - // so return skip status as 0. - // mv_cost_upd_level=1: update happens once for each sb row, - // so return skip status as 1 for - // mi_col != tile_info->mi_col_start. - // mv_cost_upd_level=2: update happens once for a set of rows, - // so return skip status as 1 appropriately. - if (!cpi->sf.inter_sf.mv_cost_upd_level) return 0; +/*! Checks whether to skip updating the entropy cost based on tile info. + * + * This function contains codes common to both \ref skip_mv_cost_update and + * \ref skip_dv_cost_update. + */ +static int skip_cost_update(const SequenceHeader *seq_params, + const TileInfo *const tile_info, const int mi_row, + const int mi_col, + INTERNAL_COST_UPDATE_TYPE upd_level) { + if (upd_level == INTERNAL_COST_UPD_SB) return 0; + if (upd_level == INTERNAL_COST_UPD_OFF) return 1; + + // upd_level is at most as frequent as each sb_row in a tile. if (mi_col != tile_info->mi_col_start) return 1; - if (cpi->sf.inter_sf.mv_cost_upd_level == 2) { - AV1_COMMON *const cm = &cpi->common; - const int mib_size_log2 = cm->seq_params.mib_size_log2; + + if (upd_level == INTERNAL_COST_UPD_SBROW_SET) { + const int mib_size_log2 = seq_params->mib_size_log2; const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; - const int sb_size = cm->seq_params.mib_size * MI_SIZE; + const int sb_size = seq_params->mib_size * MI_SIZE; const int tile_height = (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; - // When mv_cost_upd_level = 2, the cost update happens once for 2, 4 sb - // rows for sb size 128, sb size 64 respectively. However, as the update - // will not be equally spaced in smaller resolutions making it equally - // spaced by calculating (mv_num_rows_cost_update) the number of rows - // after which the cost update should happen. 
+ // When upd_level = INTERNAL_COST_UPD_SBROW_SET, the cost update happens + // once for 2, 4 sb rows for sb size 128, sb size 64 respectively. However, + // as the update will not be equally spaced in smaller resolutions making + // it equally spaced by calculating (mv_num_rows_cost_update) the number of + // rows after which the cost update should happen. const int sb_size_update_freq_map[2] = { 2, 4 }; const int update_freq_sb_rows = sb_size_update_freq_map[sb_size != MAX_SB_SIZE]; @@ -1337,6 +1328,32 @@ static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, return 0; } +// Checks for skip status of mv cost update. +static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // For intra frames, mv cdfs are not updated during the encode. Hence, the mv + // cost calculation is skipped in this case. + if (frame_is_intra_only(cm)) return 1; + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.mv_cost_upd_level); +} + +// Checks for skip status of dv cost update. +static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // Intrabc is only applicable to intra frames. So skip if intrabc is not + // allowed. 
+ if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) { + return 1; + } + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.intra_sf.dv_cost_upd_level); +} + // Update the rate costs of some symbols according to the frequency directed // by speed features void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, @@ -1355,6 +1372,9 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, if (mi_col != tile_info->mi_col_start) break; AOM_FALLTHROUGH_INTENDED; case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.coeff_cost_upd_level == INTERNAL_COST_UPD_SBROW && + mi_col != tile_info->mi_col_start) + break; av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes); break; default: assert(0); @@ -1368,6 +1388,9 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, if (mi_col != tile_info->mi_col_start) break; AOM_FALLTHROUGH_INTENDED; case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.mode_cost_upd_level == INTERNAL_COST_UPD_SBROW && + mi_col != tile_info->mi_col_start) + break; av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx); break; default: assert(0); @@ -1388,4 +1411,19 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, break; default: assert(0); } + + switch (cpi->oxcf.cost_upd_freq.dv) { + case COST_UPD_OFF: + case COST_UPD_TILE: // Tile level + break; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + // Checks for skip status of dv cost update. 
+ if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break; + av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs); + break; + default: assert(0); + } } diff --git a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h index 7bdfad5cba..3096181885 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h @@ -13,17 +13,68 @@ #define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ #include "aom_ports/aom_timer.h" +#include "aom_ports/system_state.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" -#include "av1/encoder/partition_strategy.h" #include "av1/encoder/rdopt.h" #ifdef __cplusplus extern "C" { #endif +#define WRITE_FEATURE_TO_FILE 0 + +#define FEATURE_SIZE_SMS_SPLIT_FAST 6 +#define FEATURE_SIZE_SMS_SPLIT 17 +#define FEATURE_SIZE_SMS_PRUNE_PART 25 +#define FEATURE_SIZE_SMS_TERM_NONE 28 +#define FEATURE_SIZE_FP_SMS_TERM_NONE 20 +#define FEATURE_SIZE_MAX_MIN_PART_PRED 13 +#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 + +#define FEATURE_SMS_NONE_FLAG 1 +#define FEATURE_SMS_SPLIT_FLAG (1 << 1) +#define FEATURE_SMS_RECT_FLAG (1 << 2) + +#define FEATURE_SMS_PRUNE_PART_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG) +#define FEATURE_SMS_SPLIT_MODEL_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG) + +// Number of sub-partitions in rectangular partition types. +#define SUB_PARTITIONS_RECT 2 + +// Number of sub-partitions in split partition type. +#define SUB_PARTITIONS_SPLIT 4 + +// Number of sub-partitions in AB partition types. +#define SUB_PARTITIONS_AB 3 + +// Number of sub-partitions in 4-way partition types. +#define SUB_PARTITIONS_PART4 4 + +// 4part parition types. +enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES); + +// AB parition types. 
+enum { + HORZ_A = 0, + HORZ_B, + VERT_A, + VERT_B, + NUM_AB_PARTS +} UENUM1BYTE(AB_PART_TYPE); + +// Rectangular parition types. +enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE); + +// Structure to keep win flags for HORZ and VERT partition evaluations. +typedef struct { + int rect_part_win[NUM_RECT_PARTS]; +} RD_RECT_PART_WIN_INFO; + enum { PICK_MODE_RD = 0, PICK_MODE_NONRD }; enum { @@ -218,47 +269,6 @@ static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, return &p->stats_buf_ctx->stats_in_start[frm]; } -static BLOCK_SIZE dim_to_size(int dim) { - switch (dim) { - case 4: return BLOCK_4X4; - case 8: return BLOCK_8X8; - case 16: return BLOCK_16X16; - case 32: return BLOCK_32X32; - case 64: return BLOCK_64X64; - case 128: return BLOCK_128X128; - default: assert(0); return 0; - } -} - -static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc, - AV1_COMP *cpi, MACROBLOCK *x, - const SPEED_FEATURES *sf, - BLOCK_SIZE sb_size, - int mi_row, int mi_col) { - const AV1_COMMON *cm = &cpi->common; - - sb_enc->max_partition_size = - AOMMIN(sf->part_sf.default_max_partition_size, - dim_to_size(cpi->oxcf.part_cfg.max_partition_size)); - sb_enc->min_partition_size = - AOMMAX(sf->part_sf.default_min_partition_size, - dim_to_size(cpi->oxcf.part_cfg.min_partition_size)); - sb_enc->max_partition_size = - AOMMIN(sb_enc->max_partition_size, cm->seq_params.sb_size); - sb_enc->min_partition_size = - AOMMIN(sb_enc->min_partition_size, cm->seq_params.sb_size); - - if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { - float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; - - av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); - sb_enc->max_partition_size = - AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features), - sb_enc->max_partition_size), - sb_enc->min_partition_size); - } -} - int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult); @@ -335,6 
+345,57 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info, const int mi_row, const int mi_col); +static AOM_INLINE void av1_dealloc_mb_data(struct AV1Common *cm, + struct macroblock *mb) { + if (mb->txfm_search_info.txb_rd_records) { + aom_free(mb->txfm_search_info.txb_rd_records); + mb->txfm_search_info.txb_rd_records = NULL; + } + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; plane++) { + if (mb->plane[plane].src_diff) { + aom_free(mb->plane[plane].src_diff); + mb->plane[plane].src_diff = NULL; + } + } + if (mb->e_mbd.seg_mask) { + aom_free(mb->e_mbd.seg_mask); + mb->e_mbd.seg_mask = NULL; + } + if (mb->winner_mode_stats) { + aom_free(mb->winner_mode_stats); + mb->winner_mode_stats = NULL; + } +} + +static AOM_INLINE void av1_alloc_mb_data(struct AV1Common *cm, + struct macroblock *mb, + int use_nonrd_pick_mode) { + if (!use_nonrd_pick_mode) { + mb->txfm_search_info.txb_rd_records = + (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords)); + } + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; plane++) { + const int subsampling_xy = + plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y + : 0; + const int sb_size = MAX_SB_SQUARE >> subsampling_xy; + CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff, + (int16_t *)aom_memalign( + 32, sizeof(*mb->plane[plane].src_diff) * sb_size)); + } + CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0]))); + const int winner_mode_count = frame_is_intra_only(cm) + ? MAX_WINNER_MODE_COUNT_INTRA + : MAX_WINNER_MODE_COUNT_INTER; + CHECK_MEM_ERROR(cm, mb->winner_mode_stats, + (WinnerModeStats *)aom_malloc( + winner_mode_count * sizeof(mb->winner_mode_stats[0]))); +} + // This function will compute the number of reference frames to be disabled // based on selective_ref_frame speed feature. 
static AOM_INLINE unsigned int get_num_refs_to_disable( @@ -359,7 +420,7 @@ static AOM_INLINE unsigned int get_num_refs_to_disable( #if !CONFIG_REALTIME_ONLY else if (is_stat_consumption_stage_twopass(cpi)) { const FIRSTPASS_STATS *const this_frame_stats = - read_one_frame_stats(&cpi->twopass, cur_frame_display_index); + read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index); aom_clear_system_state(); const double coded_error_per_mb = this_frame_stats->coded_error / cpi->frame_info.num_mbs; diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemb.c b/third_party/libaom/source/libaom/av1/encoder/encodemb.c index c9ee22034b..2a875e1223 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemb.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodemb.c @@ -35,19 +35,19 @@ #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" -void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride) { +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride) { assert(rows >= 4 && cols >= 4); #if CONFIG_AV1_HIGHBITDEPTH - if (is_cur_buf_hbd(xd)) { + if (bd_info.use_highbitdepth_buf) { aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, - pred8, pred_stride, xd->bd); + pred8, pred_stride, bd_info.bit_depth); return; } #endif - (void)xd; + (void)bd_info; aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride); } @@ -55,6 +55,7 @@ void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); struct 
macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; const int diff_stride = block_size_wide[plane_bsize]; @@ -66,8 +67,8 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; int16_t *src_diff = &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; - av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src, - src_stride, dst, dst_stride); + av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride, + src, src_stride, dst, dst_stride); } void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { @@ -77,9 +78,10 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); - av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride); + av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride); } int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, @@ -132,13 +134,8 @@ const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier. void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int qindex) { - const struct macroblock_plane *const p = &mb->plane[plane]; - tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); - tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); const int tx_width = tx_size_wide[tx_size]; const int tx_height = tx_size_high[tx_size]; - const int max_eob = av1_get_max_eob(tx_size); - const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); // Early return if `qindex` is out of range. 
if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) { @@ -156,6 +153,19 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, multiplier * CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX); + av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before, + dropout_num_after); +} + +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after) { + const struct macroblock_plane *const p = &mb->plane[plane]; + tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + // Early return if there are not enough non-zero coefficients. if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before) { return; @@ -172,7 +182,8 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, for (int i = 0; i < p->eobs[block]; ++i) { const int scan_idx = scan_order->scan[i]; - if (qcoeff[scan_idx] > DROPOUT_COEFF_MAX) { // Keep large coefficients. + if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) { + // Keep large coefficients. 
count_zeros_before = 0; count_zeros_after = 0; idx = -1; @@ -197,6 +208,7 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if (count_nonzeros > DROPOUT_CONTINUITY_MAX) { count_zeros_before = 0; count_zeros_after = 0; + count_nonzeros = 0; idx = -1; eob = i + 1; } @@ -513,15 +525,17 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col, const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, arg, dry_run); block += step; diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemb.h b/third_party/libaom/source/libaom/av1/encoder/encodemb.h index fcd34a3908..f2dc956a65 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemb.h +++ b/third_party/libaom/source/libaom/av1/encoder/encodemb.h @@ -123,11 +123,16 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane, // `txb_entropy_ctx`, which `mb` points to, may be modified by this function. 
void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int qindex); - -void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride); +// Same as above, with the number of zeroes needed before/after a coeff to drop +// it explicitly passed in, instead of being derived from qindex. +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after); + +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride); void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size); diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemv.c b/third_party/libaom/source/libaom/av1/encoder/encodemv.c index 86c6156d8f..4a7d87408c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemv.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodemv.c @@ -173,8 +173,8 @@ static void build_nmv_component_cost_table(int *mvcost, } } -void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, - nmv_context *mvctx, int usehp) { +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); // If the mv_diff is zero, then we should have used near or nearest instead. @@ -193,8 +193,7 @@ void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, // motion vector component used. 
if (cpi->sf.mv_sf.auto_mv_step_size) { int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; - cpi->mv_search_params.max_mv_magnitude = - AOMMAX(maxv, cpi->mv_search_params.max_mv_magnitude); + td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude); } } diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemv.h b/third_party/libaom/source/libaom/av1/encoder/encodemv.h index 9f0d607295..962844bc79 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemv.h +++ b/third_party/libaom/source/libaom/av1/encoder/encodemv.h @@ -18,8 +18,8 @@ extern "C" { #endif -void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, - nmv_context *mvctx, int usehp); +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp); void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, MvSubpelPrecision precision); diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder.c b/third_party/libaom/source/libaom/av1/encoder/encoder.c index 955d15631c..41122ef45b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder.c +++ b/third_party/libaom/source/libaom/av1/encoder/encoder.c @@ -51,6 +51,7 @@ #include "av1/encoder/aq_variance.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/dwt.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encode_strategy.h" @@ -81,10 +82,6 @@ #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 -#if CONFIG_ENTROPY_STATS -FRAME_COUNTS aggregate_fc; -#endif // CONFIG_ENTROPY_STATS - // #define OUTPUT_YUV_REC #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; @@ -228,7 +225,7 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm, const int upscaled_width = cm->superres_upscaled_width; const int height = cm->height; const int luma_pic_size = upscaled_width * height; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader 
*const seq_params = cm->seq_params; const BITSTREAM_PROFILE profile = seq_params->profile; const int pic_size_profile_factor = profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36); @@ -242,7 +239,7 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm, static void set_tile_info(AV1_COMMON *const cm, const TileConfig *const tile_cfg) { const CommonModeInfoParams *const mi_params = &cm->mi_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; int i, start_sb; @@ -298,7 +295,7 @@ void av1_update_frame_size(AV1_COMP *cpi) { // We need to reallocate the context buffers here in case we need more mis. if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } av1_init_mi_buffers(&cm->mi_params); @@ -308,8 +305,10 @@ void av1_update_frame_size(AV1_COMP *cpi) { if (!is_stat_generation_stage(cpi)) alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); - if (!cpi->seq_params_locked) - set_sb_size(&cm->seq_params, av1_select_sb_size(cpi)); + if (!cpi->ppi->seq_params_locked) + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->svc.number_spatial_layers)); set_tile_info(cm, &cpi->oxcf.tile_cfg); } @@ -327,9 +326,9 @@ static INLINE int does_level_match(int width, int height, double fps, height <= lvl_height * lvl_dim_mult; } -static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, - int width, int height, - double init_framerate) { +static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width, + int height, double init_framerate) { + SequenceHeader *const seq_params = &ppi->seq_params; // TODO(any): This is a placeholder function that only addresses dimensions // and max display sample rates. 
// Need to add checks for max bit rate, max decoded luma sample rate, header @@ -372,26 +371,26 @@ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, level = SEQ_LEVEL_6_2; } - SequenceHeader *const seq_params = &cm->seq_params; for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { - seq->seq_level_idx[i] = level; + seq_params->seq_level_idx[i] = level; // Set the maximum parameters for bitrate and buffer size for this profile, // level, and tier seq_params->op_params[i].bitrate = av1_max_level_bitrate( - cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]); + seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the // check if (seq_params->op_params[i].bitrate == 0) aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + &ppi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support this combination of profile, level, and tier."); // Buffer size in bits/s is bitrate in bits/s * 1 s seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; } } -void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, +void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, const AV1EncoderConfig *oxcf, int use_svc) { + SequenceHeader *const seq = &ppi->seq_params; const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; const ToolCfg *const tool_cfg = &oxcf->tool_cfg; @@ -449,7 +448,7 @@ void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter; seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra; - set_bitstream_level_tier(seq, cm, frm_dim_cfg->width, frm_dim_cfg->height, + set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height, oxcf->input_cfg.init_framerate); if (seq->operating_points_cnt_minus_1 == 0) { @@ -461,26 +460,27 @@ void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, // skip 
decoding enhancement layers (temporal first). int i = 0; assert(seq->operating_points_cnt_minus_1 == - (int)(cm->number_spatial_layers * cm->number_temporal_layers - 1)); - for (unsigned int sl = 0; sl < cm->number_spatial_layers; sl++) { - for (unsigned int tl = 0; tl < cm->number_temporal_layers; tl++) { + (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1)); + for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) { + for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) { seq->operating_point_idc[i] = - (~(~0u << (cm->number_spatial_layers - sl)) << 8) | - ~(~0u << (cm->number_temporal_layers - tl)); + (~(~0u << (ppi->number_spatial_layers - sl)) << 8) | + ~(~0u << (ppi->number_temporal_layers - tl)); i++; } } } } -static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { - AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; - ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; +static void init_config_sequence(struct AV1_PRIMARY *ppi, + AV1EncoderConfig *oxcf) { + SequenceHeader *const seq_params = &ppi->seq_params; const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; const ColorCfg *const color_cfg = &oxcf->color_cfg; - cpi->oxcf = *oxcf; - cpi->framerate = oxcf->input_cfg.init_framerate; + + ppi->use_svc = 0; + ppi->number_spatial_layers = 1; + ppi->number_temporal_layers = 1; seq_params->profile = oxcf->profile; seq_params->bit_depth = oxcf->tool_cfg.bit_depth; @@ -508,7 +508,7 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { // set the decoder model parameters in schedule mode seq_params->decoder_model_info.num_units_in_decoding_tick = dec_model_cfg->num_units_in_decoding_tick; - cm->buffer_removal_time_present = 1; + ppi->buffer_removal_time_present = 1; av1_set_aom_dec_model_info(&seq_params->decoder_model_info); av1_set_dec_model_op_parameters(&seq_params->op_params[0]); } else if 
(seq_params->timing_info_present && @@ -546,11 +546,19 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { } } } + av1_change_config_seq(ppi, oxcf, NULL); +} + +static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->input_cfg.init_framerate; cm->width = oxcf->frm_dim_cfg.width; cm->height = oxcf->frm_dim_cfg.height; - set_sb_size(seq_params, - av1_select_sb_size(cpi)); // set sb size before allocations + alloc_compressor_data(cpi); av1_update_film_grain_parameters(cpi, oxcf); @@ -559,18 +567,15 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { cpi->td.counts = &cpi->counts; // Set init SVC parameters. - cpi->use_svc = 0; - cpi->svc.external_ref_frame_config = 0; + cpi->svc.set_ref_frame_config = 0; cpi->svc.non_reference_frame = 0; cpi->svc.number_spatial_layers = 1; cpi->svc.number_temporal_layers = 1; - cm->number_spatial_layers = 1; - cm->number_temporal_layers = 1; cm->spatial_layer_id = 0; cm->temporal_layer_id = 0; // change includes all joint functionality - av1_change_config(cpi, oxcf); + av1_change_config(cpi, oxcf, true); cpi->ref_frame_flags = 0; @@ -583,25 +588,13 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); } -void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { - AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; - RATE_CONTROL *const rc = &cpi->rc; - MACROBLOCK *const x = &cpi->td.mb; - AV1LevelParams *const level_params = &cpi->level_params; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; +void 
av1_change_config_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf, + bool *is_sb_size_changed) { + SequenceHeader *const seq_params = &ppi->seq_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; const ColorCfg *const color_cfg = &oxcf->color_cfg; - const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; - // in case of LAP, lag in frames is set according to number of lap buffers - // calculated at init time. This stores and restores LAP's lag in frames to - // prevent override by new cfg. - int lap_lag_in_frames = -1; - if (cpi->lap_enabled && cpi->compressor_stage == LAP_STAGE) { - lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames; - } if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile; seq_params->bit_depth = oxcf->tool_cfg.bit_depth; @@ -632,7 +625,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { // set the decoder model parameters in schedule mode seq_params->decoder_model_info.num_units_in_decoding_tick = dec_model_cfg->num_units_in_decoding_tick; - cm->buffer_removal_time_present = 1; + ppi->buffer_removal_time_present = 1; av1_set_aom_dec_model_info(&seq_params->decoder_model_info); av1_set_dec_model_op_parameters(&seq_params->op_params[0]); } else if (seq_params->timing_info_present && @@ -645,6 +638,56 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { 10; // Default value (not signaled) } + av1_update_film_grain_parameters_seq(ppi, oxcf); + + int sb_size = seq_params->sb_size; + // Superblock size should not be updated after the first key frame. 
+ if (!ppi->seq_params_locked) { + set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width, + frm_dim_cfg->height, + ppi->number_spatial_layers)); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) + seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; + } + if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size) + *is_sb_size_changed = true; + + // Init sequence level coding tools + // This should not be called after the first key frame. + if (!ppi->seq_params_locked) { + seq_params->operating_points_cnt_minus_1 = + (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) + ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1 + : 0; + av1_init_seq_coding_tools(ppi, oxcf, ppi->use_svc); + } + +#if CONFIG_AV1_HIGHBITDEPTH + highbd_set_var_fns(ppi); +#endif +} + +void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf, + bool is_sb_size_changed) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + MACROBLOCK *const x = &cpi->td.mb; + AV1LevelParams *const level_params = &cpi->ppi->level_params; + InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; + RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; + const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + // in case of LAP, lag in frames is set according to number of lap buffers + // calculated at init time. This stores and restores LAP's lag in frames to + // prevent override by new cfg. 
+ int lap_lag_in_frames = -1; + if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) { + lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames; + } + av1_update_film_grain_parameters(cpi, oxcf); cpi->oxcf = *oxcf; @@ -680,10 +723,10 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { seq_params->tier[0]); } - if ((has_no_stats_stage(cpi)) && (rc_cfg->mode == AOM_Q)) { - rc->baseline_gf_interval = FIXED_GF_INTERVAL; + if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) { + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL; } else { - rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; } refresh_frame_flags->golden_frame = false; @@ -720,16 +763,23 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { } } + if (x->pixel_gradient_info == NULL) { + const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; + CHECK_MEM_ERROR(cm, x->pixel_gradient_info, + aom_malloc(sizeof(*x->pixel_gradient_info) * plane_types * + MAX_SB_SQUARE)); + } + av1_reset_segment_features(cm); av1_set_high_precision_mv(cpi, 1, 0); - set_rc_buffer_sizes(rc, rc_cfg); + set_rc_buffer_sizes(cpi); // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. - rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); - rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size); + rc->bits_off_target = AOMMIN(rc->bits_off_target, p_rc->maximum_buffer_size); + rc->buffer_level = AOMMIN(rc->buffer_level, p_rc->maximum_buffer_size); // Set up frame rate and related parameters rate control values. 
av1_new_framerate(cpi, cpi->framerate); @@ -752,18 +802,9 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cm->width = frm_dim_cfg->width; cm->height = frm_dim_cfg->height; - int sb_size = seq_params->sb_size; - // Superblock size should not be updated after the first key frame. - if (!cpi->seq_params_locked) { - set_sb_size(&cm->seq_params, av1_select_sb_size(cpi)); - for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) - seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; - } - - if (initial_dimensions->width || sb_size != seq_params->sb_size) { + if (initial_dimensions->width || is_sb_size_changed) { if (cm->width > initial_dimensions->width || - cm->height > initial_dimensions->height || - seq_params->sb_size != sb_size) { + cm->height > initial_dimensions->height || is_sb_size_changed) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); @@ -780,27 +821,15 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { set_tile_info(cm, &cpi->oxcf.tile_cfg); - if (!cpi->svc.external_ref_frame_config) + if (!cpi->svc.set_ref_frame_config) cpi->ext_flags.refresh_frame.update_pending = 0; cpi->ext_flags.refresh_frame_context_pending = 0; -#if CONFIG_AV1_HIGHBITDEPTH - highbd_set_var_fns(cpi); -#endif - - // Init sequence level coding tools - // This should not be called after the first key frame. - if (!cpi->seq_params_locked) { - seq_params->operating_points_cnt_minus_1 = - (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1) - ? cm->number_spatial_layers * cm->number_temporal_layers - 1 - : 0; - av1_init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc); - } - - if (cpi->use_svc) + if (cpi->ppi->use_svc) av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth); + check_reset_rc_flag(cpi); + // restore the value of lag_in_frame for LAP stage. 
if (lap_lag_in_frames != -1) { cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; @@ -810,7 +839,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { static INLINE void init_frame_info(FRAME_INFO *frame_info, const AV1_COMMON *const cm) { const CommonModeInfoParams *const mi_params = &cm->mi_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; frame_info->frame_width = cm->width; frame_info->frame_height = cm->height; frame_info->mi_cols = mi_params->mi_cols; @@ -834,73 +863,44 @@ static INLINE void update_frame_index_set(FRAME_INDEX_SET *frame_index_set, } } -AV1_PRIMARY *av1_create_primary_compressor() { +AV1_PRIMARY *av1_create_primary_compressor( + struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, + AV1EncoderConfig *oxcf) { AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY)); if (!ppi) return NULL; av1_zero(*ppi); - return ppi; -} - -AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, - BufferPool *const pool, - FIRSTPASS_STATS *frame_stats_buf, - COMPRESSOR_STAGE stage, int num_lap_buffers, - int lap_lag_in_frames, - STATS_BUFFER_CTX *stats_buf_context) { - AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); - AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; - - if (!cm) return NULL; - - av1_zero(*cpi); - - cpi->ppi = ppi; - // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. 
- if (setjmp(cm->error.jmp)) { - cm->error.setjmp = 0; - av1_remove_compressor(cpi); + if (setjmp(ppi->error.jmp)) { + ppi->error.setjmp = 0; + av1_remove_primary_compressor(ppi); return 0; } + ppi->error.setjmp = 1; - cm->error.setjmp = 1; - cpi->lap_enabled = num_lap_buffers > 0; - cpi->compressor_stage = stage; - - CommonModeInfoParams *const mi_params = &cm->mi_params; - mi_params->free_mi = enc_free_mi; - mi_params->setup_mi = enc_setup_mi; - mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE) - ? stat_stage_set_mb_mi - : enc_set_mb_mi; - - mi_params->mi_alloc_bsize = BLOCK_4X4; - - CHECK_MEM_ERROR(cm, cm->fc, - (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); - CHECK_MEM_ERROR( - cm, cm->default_frame_context, - (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); - memset(cm->fc, 0, sizeof(*cm->fc)); - memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); - - cpi->common.buffer_pool = pool; + ppi->seq_params_locked = 0; + ppi->lap_enabled = num_lap_buffers > 0; + ppi->output_pkt_list = pkt_list_head; + ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS; + ppi->frames_left = oxcf->input_cfg.limit; +#if CONFIG_FRAME_PARALLEL_ENCODE + ppi->max_mv_magnitude = 0; + ppi->num_fp_contexts = 1; +#endif - init_config(cpi, oxcf); - if (cpi->compressor_stage == LAP_STAGE) { - cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; - } + init_config_sequence(ppi, oxcf); - cpi->frames_left = cpi->oxcf.input_cfg.limit; +#if CONFIG_ENTROPY_STATS + av1_zero(ppi->aggregate_fc); +#endif // CONFIG_ENTROPY_STATS - av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + av1_primary_rc_init(oxcf, &ppi->p_rc); // For two pass and lag_in_frames > 33 in LAP. 
- cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2; - if (cpi->lap_enabled) { + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2; + if (ppi->lap_enabled) { if ((num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) && num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) { @@ -908,219 +908,22 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, * For lag in frames >= 19 and <33, enable scenecut * with limited future frame prediction. */ - cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1; + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1; } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) { // Disable scenecut when lag_in_frames < 19. - cpi->rc.enable_scenecut_detection = DISABLE_SCENECUT; + ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT; } } - init_frame_info(&cpi->frame_info, cm); - init_frame_index_set(&cpi->frame_index_set); - - cm->current_frame.frame_number = 0; - cm->current_frame_id = -1; - cpi->seq_params_locked = 0; - cpi->partition_search_skippable_frame = 0; - cpi->tile_data = NULL; - cpi->last_show_frame_buf = NULL; - realloc_segmentation_maps(cpi); - - cpi->refresh_frame.alt_ref_frame = false; - - cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; -#if CONFIG_INTERNAL_STATS - cpi->b_calculate_blockiness = 1; - cpi->b_calculate_consistency = 1; - cpi->total_inconsistency = 0; - cpi->psnr[0].worst = 100.0; - cpi->psnr[1].worst = 100.0; - cpi->worst_ssim = 100.0; - cpi->worst_ssim_hbd = 100.0; - - cpi->count[0] = 0; - cpi->count[1] = 0; - cpi->bytes = 0; -#if CONFIG_SPEED_STATS - cpi->tx_search_count = 0; -#endif // CONFIG_SPEED_STATS - - if (cpi->b_calculate_psnr) { - cpi->total_sq_error[0] = 0; - cpi->total_samples[0] = 0; - cpi->total_sq_error[1] = 0; - cpi->total_samples[1] = 0; - cpi->tot_recode_hits = 0; - cpi->summed_quality = 0; - cpi->summed_weights = 0; - cpi->summed_quality_hbd = 0; - cpi->summed_weights_hbd = 0; - } - - cpi->fastssim.worst = 100.0; - 
cpi->psnrhvs.worst = 100.0; - - if (cpi->b_calculate_blockiness) { - cpi->total_blockiness = 0; - cpi->worst_blockiness = 0.0; - } - - if (cpi->b_calculate_consistency) { - CHECK_MEM_ERROR( - cm, cpi->ssim_vars, - aom_malloc(sizeof(*cpi->ssim_vars) * 4 * cpi->common.mi_params.mi_rows * - cpi->common.mi_params.mi_cols)); - cpi->worst_consistency = 100.0; - } -#endif -#if CONFIG_ENTROPY_STATS - av1_zero(aggregate_fc); -#endif // CONFIG_ENTROPY_STATS - - cpi->time_stamps.first_ts_start = INT64_MAX; - -#ifdef OUTPUT_YUV_REC - yuv_rec_file = fopen("rec.yuv", "wb"); -#endif -#ifdef OUTPUT_YUV_DENOISED - yuv_denoised_file = fopen("denoised.yuv", "wb"); -#endif - - assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); - int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS); - for (int i = 0; i < size; i++) - cpi->twopass.frame_stats_arr[i] = &frame_stats_buf[i]; - - cpi->twopass.stats_buf_ctx = stats_buf_context; - cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start; - -#if !CONFIG_REALTIME_ONLY - if (is_stat_consumption_stage(cpi)) { - const size_t packet_sz = sizeof(FIRSTPASS_STATS); - const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz); - - if (!cpi->lap_enabled) { - /*Re-initialize to stats buffer, populated by application in the case of - * two pass*/ - cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->twopass_stats_in.buf; - cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start; - cpi->twopass.stats_buf_ctx->stats_in_end = - &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1]; - - av1_init_second_pass(cpi); - } else { - av1_init_single_pass_lap(cpi); - } - } -#endif - - alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm); - - CHECK_MEM_ERROR( - cm, cpi->td.mb.inter_modes_info, - (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info))); - - for (int x = 0; x < 2; x++) - for (int y = 0; y < 2; y++) - CHECK_MEM_ERROR( - cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], - (uint32_t *)aom_malloc( - 
AOM_BUFFER_SIZE_FOR_BLOCK_HASH * - sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0]))); - - cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0; - - av1_set_speed_features_framesize_independent(cpi, oxcf->speed); - av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); - - CHECK_MEM_ERROR(cm, cpi->consec_zero_mv, - aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2, - sizeof(*cpi->consec_zero_mv))); - - { - const int bsize = BLOCK_16X16; - const int w = mi_size_wide[bsize]; - const int h = mi_size_high[bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->tpl_rdmult_scaling_factors))); - CHECK_MEM_ERROR(cm, cpi->tpl_sb_rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->tpl_sb_rdmult_scaling_factors))); - } - - { - const int bsize = BLOCK_16X16; - const int w = mi_size_wide[bsize]; - const int h = mi_size_high[bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->ssim_rdmult_scaling_factors))); - } - -#if CONFIG_TUNE_VMAF - { - const int bsize = BLOCK_64X64; - const int w = mi_size_wide[bsize]; - const int h = mi_size_high[bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->vmaf_info.rdmult_scaling_factors))); - for (int i = 0; i < MAX_ARF_LAYERS; i++) { - cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0; - cpi->vmaf_info.last_frame_ysse[i] = -1.0; - cpi->vmaf_info.last_frame_vmaf[i] = -1.0; - } - cpi->vmaf_info.original_qindex = -1; - -#if CONFIG_USE_VMAF_RC - cpi->vmaf_info.vmaf_model = NULL; -#endif 
- } -#endif - -#if CONFIG_TUNE_BUTTERAUGLI - { - const int w = mi_size_wide[butteraugli_rdo_bsize]; - const int h = mi_size_high[butteraugli_rdo_bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR( - cm, cpi->butteraugli_info.rdmult_scaling_factors, - aom_malloc(num_rows * num_cols * - sizeof(*cpi->butteraugli_info.rdmult_scaling_factors))); - memset(&cpi->butteraugli_info.source, 0, - sizeof(cpi->butteraugli_info.source)); - memset(&cpi->butteraugli_info.resized_source, 0, - sizeof(cpi->butteraugli_info.resized_source)); - cpi->butteraugli_info.recon_set = false; - } -#endif - -#if !CONFIG_REALTIME_ONLY - if (!is_stat_generation_stage(cpi)) { - av1_setup_tpl_buffers(cm, &cpi->tpl_data, cpi->oxcf.gf_cfg.lag_in_frames); - } -#endif - -#if CONFIG_COLLECT_PARTITION_STATS - av1_zero(cpi->partition_stats); -#endif // CONFIG_COLLECT_PARTITION_STATS #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].jsdaf = JSDAF; \ - cpi->fn_ptr[BT].jsvaf = JSVAF; + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; // Realtime mode doesn't use 4x rectangular blocks. 
#if !CONFIG_REALTIME_ONLY @@ -1233,9 +1036,9 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #if !CONFIG_REALTIME_ONLY #define OBFP(BT, OSDF, OVF, OSVF) \ - cpi->fn_ptr[BT].osdf = OSDF; \ - cpi->fn_ptr[BT].ovf = OVF; \ - cpi->fn_ptr[BT].osvf = OSVF; + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128, aom_obmc_sub_pixel_variance128x128) @@ -1284,8 +1087,8 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #endif // !CONFIG_REALTIME_ONLY #define MBFP(BT, MCSDF, MCSVF) \ - cpi->fn_ptr[BT].msdf = MCSDF; \ - cpi->fn_ptr[BT].msvf = MCSVF; + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_sub_pixel_variance128x128) @@ -1315,8 +1118,8 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #endif #define SDSFP(BT, SDSF, SDSX4DF) \ - cpi->fn_ptr[BT].sdsf = SDSF; \ - cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d); SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d); @@ -1346,16 +1149,281 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #undef SDSFP #if CONFIG_AV1_HIGHBITDEPTH - highbd_set_var_fns(cpi); + highbd_set_var_fns(ppi); +#endif + + { + // As cm->mi_params is a part of the frame level context (cpi), it is + // unavailable at this point. mi_params is created as a local temporary + // variable, to be passed into the functions used for allocating tpl + // buffers. The values in this variable are populated according to initial + // width and height of the frame. 
+ CommonModeInfoParams mi_params; + enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height); + + const int bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params.mi_cols + w - 1) / w; + const int num_rows = (mi_params.mi_rows + h - 1) / h; + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->tpl_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*ppi->tpl_rdmult_scaling_factors))); + AOM_CHECK_MEM_ERROR( + &ppi->error, ppi->tpl_sb_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*ppi->tpl_sb_rdmult_scaling_factors))); + +#if !CONFIG_REALTIME_ONLY + if (oxcf->pass != 1) { + av1_setup_tpl_buffers(ppi, &mi_params, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, 0, + oxcf->gf_cfg.lag_in_frames); + } +#endif + +#if CONFIG_INTERNAL_STATS + ppi->b_calculate_blockiness = 1; + ppi->b_calculate_consistency = 1; + + for (int i = 0; i <= STAT_ALL; i++) { + ppi->psnr[0].stat[i] = 0; + ppi->psnr[1].stat[i] = 0; + + ppi->fastssim.stat[i] = 0; + ppi->psnrhvs.stat[i] = 0; + } + + ppi->psnr[0].worst = 100.0; + ppi->psnr[1].worst = 100.0; + ppi->worst_ssim = 100.0; + ppi->worst_ssim_hbd = 100.0; + + ppi->count[0] = 0; + ppi->count[1] = 0; + ppi->total_bytes = 0; + + if (ppi->b_calculate_psnr) { + ppi->total_sq_error[0] = 0; + ppi->total_samples[0] = 0; + ppi->total_sq_error[1] = 0; + ppi->total_samples[1] = 0; + ppi->total_recode_hits = 0; + ppi->summed_quality = 0; + ppi->summed_weights = 0; + ppi->summed_quality_hbd = 0; + ppi->summed_weights_hbd = 0; + } + + ppi->fastssim.worst = 100.0; + ppi->psnrhvs.worst = 100.0; + + if (ppi->b_calculate_blockiness) { + ppi->total_blockiness = 0; + ppi->worst_blockiness = 0.0; + } + + ppi->total_inconsistency = 0; + ppi->worst_consistency = 100.0; + if (ppi->b_calculate_consistency) { + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars, + aom_malloc(sizeof(*ppi->ssim_vars) * 4 * + mi_params.mi_rows * mi_params.mi_cols)); + } 
+#endif + } + + ppi->error.setjmp = 0; + return ppi; +} + +AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, + BufferPool *const pool, COMPRESSOR_STAGE stage, + int lap_lag_in_frames) { + AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); + AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; + + if (!cm) return NULL; + + av1_zero(*cpi); + + cpi->ppi = ppi; + cm->seq_params = &ppi->seq_params; + cm->error = &ppi->error; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + av1_remove_compressor(cpi); + return 0; + } + + cm->error->setjmp = 1; + cpi->compressor_stage = stage; + +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi->do_frame_data_update = true; +#endif + + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->free_mi = enc_free_mi; + mi_params->setup_mi = enc_setup_mi; + mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE) + ? 
stat_stage_set_mb_mi + : enc_set_mb_mi; + + mi_params->mi_alloc_bsize = BLOCK_4X4; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + cm, cm->default_frame_context, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); + + cpi->common.buffer_pool = pool; + + init_config(cpi, oxcf); + if (cpi->compressor_stage == LAP_STAGE) { + cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; + } + + av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc, &cpi->ppi->p_rc); + + init_frame_info(&cpi->frame_info, cm); + init_frame_index_set(&cpi->frame_index_set); + + cm->current_frame.frame_number = 0; + cm->current_frame_id = -1; + cpi->partition_search_skippable_frame = 0; + cpi->tile_data = NULL; + cpi->last_show_frame_buf = NULL; + realloc_segmentation_maps(cpi); + + cpi->refresh_frame.alt_ref_frame = false; + +#if CONFIG_SPEED_STATS + cpi->tx_search_count = 0; +#endif // CONFIG_SPEED_STATS + + cpi->time_stamps.first_ts_start = INT64_MAX; + +#ifdef OUTPUT_YUV_REC + yuv_rec_file = fopen("rec.yuv", "wb"); +#endif +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "wb"); +#endif + +#if !CONFIG_REALTIME_ONLY + if (is_stat_consumption_stage(cpi)) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz); + + if (!cpi->ppi->lap_enabled) { + /*Re-initialize to stats buffer, populated by application in the case of + * two pass*/ + cpi->ppi->twopass.stats_buf_ctx->stats_in_start = + oxcf->twopass_stats_in.buf; + cpi->ppi->twopass.stats_in = + cpi->ppi->twopass.stats_buf_ctx->stats_in_start; + cpi->ppi->twopass.stats_buf_ctx->stats_in_end = + &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1]; + + av1_init_second_pass(cpi); + } else { + av1_init_single_pass_lap(cpi); + } + } +#endif + + 
alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm); + + CHECK_MEM_ERROR( + cm, cpi->td.mb.inter_modes_info, + (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info))); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0]))); + + cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0; + + av1_set_speed_features_framesize_independent(cpi, oxcf->speed); + av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); + + CHECK_MEM_ERROR(cm, cpi->consec_zero_mv, + aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2, + sizeof(*cpi->consec_zero_mv))); + + { + const int bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->ssim_rdmult_scaling_factors))); + } + +#if CONFIG_TUNE_VMAF + { + const int bsize = BLOCK_64X64; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->vmaf_info.rdmult_scaling_factors))); + for (int i = 0; i < MAX_ARF_LAYERS; i++) { + cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0; + cpi->vmaf_info.last_frame_ysse[i] = -1.0; + cpi->vmaf_info.last_frame_vmaf[i] = -1.0; + } + cpi->vmaf_info.original_qindex = -1; + cpi->vmaf_info.vmaf_model = NULL; + } +#endif + +#if CONFIG_TUNE_BUTTERAUGLI + { + const int w = mi_size_wide[butteraugli_rdo_bsize]; + const int h = mi_size_high[butteraugli_rdo_bsize]; + const int num_cols = (mi_params->mi_cols 
+ w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR( + cm, cpi->butteraugli_info.rdmult_scaling_factors, + aom_malloc(num_rows * num_cols * + sizeof(*cpi->butteraugli_info.rdmult_scaling_factors))); + memset(&cpi->butteraugli_info.source, 0, + sizeof(cpi->butteraugli_info.source)); + memset(&cpi->butteraugli_info.resized_source, 0, + sizeof(cpi->butteraugli_info.resized_source)); + cpi->butteraugli_info.recon_set = false; + } #endif +#if CONFIG_COLLECT_PARTITION_STATS + av1_zero(cpi->partition_stats); +#endif // CONFIG_COLLECT_PARTITION_STATS + /* av1_init_quantizer() is first called here. Add check in * av1_frame_init_quantizer() so that av1_init_quantizer is only * called later when needed. This will avoid unnecessary calls of * av1_init_quantizer() for every frame. */ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_qm_init(&cm->quant_params, av1_num_planes(cm)); av1_loop_filter_init(cm); @@ -1365,7 +1433,7 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #if !CONFIG_REALTIME_ONLY av1_loop_restoration_precal(); #endif - cm->error.setjmp = 0; + cm->error->setjmp = 0; return cpi; } @@ -1402,6 +1470,7 @@ static AOM_INLINE void free_thread_data(AV1_COMP *cpi) { for (int j = 0; j < 2; ++j) { aom_free(thread_data->td->tmp_pred_bufs[j]); } + aom_free(thread_data->td->pixel_gradient_info); release_obmc_buffers(&thread_data->td->obmc_buffer); aom_free(thread_data->td->vt64x64); @@ -1423,7 +1492,27 @@ static AOM_INLINE void free_thread_data(AV1_COMP *cpi) { void av1_remove_primary_compressor(AV1_PRIMARY *ppi) { if (!ppi) return; + aom_free_frame_buffer(&ppi->alt_ref_buffer); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + aom_free(ppi->level_params.level_info[i]); + } av1_lookahead_destroy(ppi->lookahead); + + aom_free(ppi->tpl_rdmult_scaling_factors); + ppi->tpl_rdmult_scaling_factors = NULL; + 
aom_free(ppi->tpl_sb_rdmult_scaling_factors); + ppi->tpl_sb_rdmult_scaling_factors = NULL; + + TplParams *const tpl_data = &ppi->tpl_data; + for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { + aom_free(tpl_data->tpl_stats_pool[frame]); + aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]); + } + +#if !CONFIG_REALTIME_ONLY + av1_tpl_dealloc(&tpl_data->tpl_mt_sync); +#endif + aom_free(ppi); } @@ -1432,127 +1521,6 @@ void av1_remove_compressor(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; if (cm->current_frame.frame_number > 0) { -#if CONFIG_ENTROPY_STATS - if (!is_stat_generation_stage(cpi)) { - fprintf(stderr, "Writing counts.stt\n"); - FILE *f = fopen("counts.stt", "wb"); - fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f); - fclose(f); - } -#endif // CONFIG_ENTROPY_STATS -#if CONFIG_INTERNAL_STATS - aom_clear_system_state(); - - if (!is_stat_generation_stage(cpi)) { - char headings[512] = { 0 }; - char results[512] = { 0 }; - FILE *f = fopen("opsnr.stt", "a"); - double time_encoded = - (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) / - 10000000.000; - double total_encode_time = - (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; - const double dr = - (double)cpi->bytes * (double)8 / (double)1000 / time_encoded; - const double peak = - (double)((1 << cpi->oxcf.input_cfg.input_bit_depth) - 1); - const double target_rate = - (double)cpi->oxcf.rc_cfg.target_bandwidth / 1000; - const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); - - if (cpi->b_calculate_psnr) { - const double total_psnr = - aom_sse_to_psnr((double)cpi->total_samples[0], peak, - (double)cpi->total_sq_error[0]); - const double total_ssim = - 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); - snprintf(headings, sizeof(headings), - "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" - "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" - "AVPsrnY\tAPsnrCb\tAPsnrCr"); - snprintf(results, sizeof(results), - 
"%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f", - dr, cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr, - cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr, - total_ssim, total_ssim, - cpi->fastssim.stat[STAT_ALL] / cpi->count[0], - cpi->psnrhvs.stat[STAT_ALL] / cpi->count[0], - cpi->psnr[0].worst, cpi->worst_ssim, cpi->fastssim.worst, - cpi->psnrhvs.worst, cpi->psnr[0].stat[STAT_Y] / cpi->count[0], - cpi->psnr[0].stat[STAT_U] / cpi->count[0], - cpi->psnr[0].stat[STAT_V] / cpi->count[0]); - - if (cpi->b_calculate_blockiness) { - SNPRINT(headings, "\t Block\tWstBlck"); - SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count[0]); - SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness); - } - - if (cpi->b_calculate_consistency) { - double consistency = - aom_sse_to_psnr((double)cpi->total_samples[0], peak, - (double)cpi->total_inconsistency); - - SNPRINT(headings, "\tConsist\tWstCons"); - SNPRINT2(results, "\t%7.3f", consistency); - SNPRINT2(results, "\t%7.3f", cpi->worst_consistency); - } - - SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); - SNPRINT2(results, "\t%8.0f", total_encode_time); - SNPRINT2(results, " %7.2f", rate_err); - SNPRINT2(results, " %7.2f", fabs(rate_err)); - - SNPRINT(headings, "\tAPsnr611"); - SNPRINT2(results, " %7.3f", - (6 * cpi->psnr[0].stat[STAT_Y] + cpi->psnr[0].stat[STAT_U] + - cpi->psnr[0].stat[STAT_V]) / - (cpi->count[0] * 8)); - -#if CONFIG_AV1_HIGHBITDEPTH - const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; - const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; - if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && - (in_bit_depth < bit_depth)) { - const double peak_hbd = (double)((1 << bit_depth) - 1); - const double total_psnr_hbd = - aom_sse_to_psnr((double)cpi->total_samples[1], peak_hbd, - (double)cpi->total_sq_error[1]); - const double total_ssim_hbd = - 100 * pow(cpi->summed_quality_hbd / cpi->summed_weights_hbd, 8.0); 
- SNPRINT(headings, - "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH" - " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH" - " AOMSSIMH VPSSIMPH WstSsimH"); - SNPRINT2(results, "\t%7.3f", - cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]); - SNPRINT2(results, " %7.3f", total_psnr_hbd); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]); - SNPRINT2(results, " %7.3f", total_psnr_hbd); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_Y] / cpi->count[1]); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_U] / cpi->count[1]); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_V] / cpi->count[1]); - SNPRINT2(results, " %7.3f", cpi->psnr[1].worst); - SNPRINT2(results, " %7.3f", total_ssim_hbd); - SNPRINT2(results, " %7.3f", total_ssim_hbd); - SNPRINT2(results, " %7.3f", cpi->worst_ssim_hbd); - } -#endif - fprintf(f, "%s\n", headings); - fprintf(f, "%s\n", results); - } - - fclose(f); - } -#endif // CONFIG_INTERNAL_STATS #if CONFIG_SPEED_STATS if (!is_stat_generation_stage(cpi)) { fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count); @@ -1571,12 +1539,6 @@ void av1_remove_compressor(AV1_COMP *cpi) { av1_denoiser_free(&(cpi->denoiser)); #endif - TplParams *const tpl_data = &cpi->tpl_data; - for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { - aom_free(tpl_data->tpl_stats_pool[frame]); - aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]); - } - if (cpi->compressor_stage != LAP_STAGE) { terminate_worker_data(cpi); free_thread_data(cpi); @@ -1586,6 +1548,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { #if CONFIG_MULTITHREAD pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_; pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_; + pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_; if (enc_row_mt_mutex_ != NULL) { pthread_mutex_destroy(enc_row_mt_mutex_); aom_free(enc_row_mt_mutex_); @@ -1594,6 +1557,10 @@ void av1_remove_compressor(AV1_COMP *cpi) { pthread_mutex_destroy(gm_mt_mutex_); 
aom_free(gm_mt_mutex_); } + if (pack_bs_mt_mutex_ != NULL) { + pthread_mutex_destroy(pack_bs_mt_mutex_); + aom_free(pack_bs_mt_mutex_); + } #endif av1_row_mt_mem_dealloc(cpi); if (cpi->compressor_stage != LAP_STAGE) { @@ -1601,9 +1568,6 @@ void av1_remove_compressor(AV1_COMP *cpi) { aom_free(mt_info->workers); } -#if !CONFIG_REALTIME_ONLY - av1_tpl_dealloc(&tpl_data->tpl_mt_sync); -#endif if (mt_info->num_workers > 1) { av1_loop_filter_dealloc(&mt_info->lf_row_sync); av1_cdef_mt_dealloc(&mt_info->cdef_sync); @@ -1617,13 +1581,9 @@ void av1_remove_compressor(AV1_COMP *cpi) { dealloc_compressor_data(cpi); -#if CONFIG_INTERNAL_STATS - aom_free(cpi->ssim_vars); - cpi->ssim_vars = NULL; -#endif // CONFIG_INTERNAL_STATS + av1_ext_part_delete(&cpi->ext_part_controller); av1_remove_common(cm); - av1_free_ref_frame_buffers(cm->buffer_pool); aom_free(cpi); @@ -1667,7 +1627,7 @@ static void generate_psnr_packet(AV1_COMP *cpi) { #endif pkt.kind = AOM_CODEC_PSNR_PKT; - aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt); + aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt); } int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) { @@ -1781,7 +1741,12 @@ static void set_mv_search_params(AV1_COMP *cpi) { mv_search_params->mv_step_param = av1_init_search_range( AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude)); } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Reset max_mv_magnitude for parallel frames based on update flag. 
+ if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1; +#else mv_search_params->max_mv_magnitude = -1; +#endif } } } @@ -1789,14 +1754,14 @@ static void set_mv_search_params(AV1_COMP *cpi) { void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) { const AV1_COMMON *const cm = &cpi->common; - if (cm->seq_params.force_screen_content_tools != 2) { + if (cm->seq_params->force_screen_content_tools != 2) { features->allow_screen_content_tools = features->allow_intrabc = - cm->seq_params.force_screen_content_tools; + cm->seq_params->force_screen_content_tools; return; } if (cpi->oxcf.mode == REALTIME) { - assert(cm->seq_params.reduced_still_picture_hdr); + assert(cm->seq_params->reduced_still_picture_hdr); features->allow_screen_content_tools = features->allow_intrabc = 0; return; } @@ -1814,7 +1779,7 @@ void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) { const int stride = cpi->unfiltered_source->y_stride; const int width = cpi->unfiltered_source->y_width; const int height = cpi->unfiltered_source->y_height; - const int bd = cm->seq_params.bit_depth; + const int bd = cm->seq_params->bit_depth; const int blk_w = 16; const int blk_h = 16; // These threshold values are selected experimentally. 
@@ -1960,7 +1925,7 @@ static void init_ref_frame_bufs(AV1_COMP *cpi) { void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; + SequenceHeader *const seq_params = cm->seq_params; InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; if (!initial_dimensions->width || @@ -1994,11 +1959,11 @@ static void setup_denoiser_buffer(AV1_COMP *cpi) { if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { if (av1_denoiser_alloc( - cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc, cpi->oxcf.noise_sensitivity, cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate denoiser"); } } @@ -2008,9 +1973,9 @@ static void setup_denoiser_buffer(AV1_COMP *cpi) { int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth, - cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y); + av1_check_initial_width(cpi, cm->seq_params->use_highbitdepth, + cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y); if (width <= 0 || height <= 0) return 1; @@ -2040,7 +2005,7 @@ int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; 
const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int ref_frame; @@ -2078,7 +2043,7 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, cm->mi_params.mi_cols, av1_num_planes(cm))) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } @@ -2088,11 +2053,16 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); + if (!is_stat_generation_stage(cpi)) + av1_alloc_cdef_buffers(cm, &cpi->mt_info.cdef_worker, + &cpi->mt_info.cdef_sync, + cpi->mt_info.num_mod_workers[MOD_CDEF]); + #if !CONFIG_REALTIME_ONLY - const int use_restoration = cm->seq_params.enable_restoration && + const int use_restoration = cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; if (use_restoration) { @@ -2107,6 +2077,7 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { av1_alloc_restoration_buffers(cm); } #endif + if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); @@ -2145,13 +2116,22 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm, #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, cdef_time); #endif + const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF]; // Find CDEF parameters av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd, - cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult); + cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult, + cpi->sf.rt_sf.skip_cdef_sb, cpi->rc.frames_since_key); // Apply the filter - 
if (!cpi->sf.rt_sf.skip_loopfilter_non_reference) - av1_cdef_frame(&cm->cur_frame->buf, cm, xd); + if (!cpi->sf.rt_sf.skip_loopfilter_non_reference) { + if (num_workers > 1) { + av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker, + cpi->mt_info.workers, &cpi->mt_info.cdef_sync, + num_workers, av1_cdef_init_fb_row_mt); + } else { + av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row); + } + } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, cdef_time); #endif @@ -2211,11 +2191,19 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { const int use_loopfilter = !cm->features.coded_lossless && !cm->tiles.large_scale; - const int use_cdef = cm->seq_params.enable_cdef && + const int use_cdef = cm->seq_params->enable_cdef && !cm->features.coded_lossless && !cm->tiles.large_scale; - const int use_restoration = cm->seq_params.enable_restoration && + const int use_restoration = cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; + const int cur_width = cm->cur_frame->width; + const int cur_height = cm->cur_frame->height; + const int cur_width_mib = cm->mi_params.mi_cols * MI_SIZE; + const int cur_height_mib = cm->mi_params.mi_rows * MI_SIZE; + const int is_realtime = + cpi->sf.rt_sf.use_nonrd_pick_mode && !(cm->mi_params.mi_cols % 2) && + !(cm->mi_params.mi_rows % 2) && (cur_width_mib - cur_width < MI_SIZE) && + (cur_height_mib - cur_height < MI_SIZE); struct loopfilter *lf = &cm->lf; @@ -2238,13 +2226,13 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { 0, #endif mt_info->workers, num_workers, - &mt_info->lf_row_sync); + &mt_info->lf_row_sync, is_realtime); else av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd, #if CONFIG_LPF_MASK 0, #endif - 0, num_planes, 0); + 0, num_planes, 0, is_realtime); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, loop_filter_time); @@ -2278,16 +2266,17 @@ static int encode_without_recode(AV1_COMP *cpi) { int top_index = 0, bottom_index = 0, q = 0; 
YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source; InterpFilter filter_scaler = - cpi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id] - : EIGHTTAP_SMOOTH; - int phase_scaler = - cpi->use_svc ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; + cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id] + : EIGHTTAP_SMOOTH; + int phase_scaler = cpi->ppi->use_svc + ? svc->downsample_filter_phase[svc->spatial_layer_id] + : 0; set_size_independent_vars(cpi); av1_setup_frame_size(cpi); av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); - if (!cpi->use_svc) { + if (!cpi->ppi->use_svc) { phase_scaler = 8; // 2:1 scaling. if ((cm->width << 1) == unscaled->y_crop_width && @@ -2315,6 +2304,12 @@ static int encode_without_recode(AV1_COMP *cpi) { printf("\n Encoding a frame:"); #endif +#if CONFIG_TUNE_BUTTERAUGLI + if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + av1_setup_butteraugli_rdmult(cpi); + } +#endif + aom_clear_system_state(); cpi->source = av1_scale_if_required(cm, unscaled, &cpi->scaled_source, @@ -2336,7 +2331,7 @@ static int encode_without_recode(AV1_COMP *cpi) { } #if CONFIG_AV1_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc) + if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc) av1_denoiser_reset_on_first_frame(cpi); #endif @@ -2365,7 +2360,7 @@ static int encode_without_recode(AV1_COMP *cpi) { // (zero_mode is forced), and since the scaled references are only // use for newmv search, we can avoid scaling here. 
if (!frame_is_intra_only(cm) && - !(cpi->use_svc && cpi->svc.force_zero_mode_spatial_ref)) + !(cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref)) av1_scale_references(cpi, filter_scaler, phase_scaler, 1); av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, @@ -2373,7 +2368,7 @@ static int encode_without_recode(AV1_COMP *cpi) { av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); if ((q_cfg->deltaq_mode != NO_DELTA_Q) || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); av1_setup_frame(cpi); @@ -2388,7 +2383,7 @@ static int encode_without_recode(AV1_COMP *cpi) { av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); if (frame_is_intra_only(cm) || cm->features.error_resilient_mode) av1_setup_frame(cpi); @@ -2432,7 +2427,7 @@ static int encode_without_recode(AV1_COMP *cpi) { end_timing(cpi, av1_encode_frame_time); #endif #if CONFIG_INTERNAL_STATS - ++cpi->tot_recode_hits; + ++cpi->frame_recode_hits; #endif aom_clear_system_state(); @@ -2504,7 +2499,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { if (!cpi->sf.hl_sf.disable_extra_sc_testing) av1_determine_sc_tools_with_encoding(cpi, q); -#if CONFIG_USE_VMAF_RC +#if CONFIG_TUNE_VMAF if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source); } @@ -2525,6 +2520,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { do { loop = 0; + int do_mv_stats_collection = 1; aom_clear_system_state(); // if frame was scaled calculate global_motion_search again if already @@ 
-2580,7 +2576,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); @@ -2636,14 +2632,19 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Disable mv_stats collection for parallel frames based on update flag. + if (!cpi->do_frame_data_update) do_mv_stats_collection = 0; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Reset the mv_stats in case we are interrupted by an intraframe or an // overlay frame. - if (cpi->mv_stats.valid) { - av1_zero(cpi->mv_stats); + if (cpi->ppi->mv_stats.valid && do_mv_stats_collection) { + av1_zero(cpi->ppi->mv_stats); } // Gather the mv_stats for the next frame if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && - av1_frame_allows_smart_mv(cpi)) { + av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) { av1_collect_mv_stats(cpi, q); } @@ -2653,6 +2654,9 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { aom_clear_system_state(); +#if CONFIG_BITRATE_ACCURACY + const int do_dummy_pack = 1; +#else // CONFIG_BITRATE_ACCURACY // Dummy pack of the bitstream using up to date stats to get an // accurate estimate of output frame size to determine if we need // to recode. 
@@ -2660,6 +2664,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF && oxcf->rc_cfg.mode != AOM_Q) || oxcf->rc_cfg.min_cr > 0; +#endif // CONFIG_BITRATE_ACCURACY if (do_dummy_pack) { av1_finalize_encoded_frame(cpi); int largest_tile_id = 0; // Output from bitstream: unused here @@ -2669,7 +2674,15 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { return AOM_CODEC_ERROR; } + // bits used for this frame rc->projected_frame_size = (int)(*size) << 3; + +#if CONFIG_BITRATE_ACCURACY + cpi->ppi->tpl_data.actual_gop_bitrate += rc->projected_frame_size; + printf("\nframe: %d, projected frame size: %d, total: %f\n", + cpi->gf_frame_index, rc->projected_frame_size, + cpi->ppi->tpl_data.actual_gop_bitrate); +#endif } #if CONFIG_TUNE_VMAF @@ -2688,15 +2701,19 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { #if CONFIG_TUNE_BUTTERAUGLI if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { loop = 1; - av1_restore_butteraugli_source(cpi); + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4); } #endif +#if CONFIG_BITRATE_ACCURACY + loop = 0; // turn off recode loop when CONFIG_BITRATE_ACCURACY is on +#endif // CONFIG_BITRATE_ACCURACY + if (loop) { ++loop_count; #if CONFIG_INTERNAL_STATS - ++cpi->tot_recode_hits; + ++cpi->frame_recode_hits; #endif } #if CONFIG_COLLECT_COMPONENT_TIMING @@ -2796,12 +2813,12 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, #endif AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; + SequenceHeader *const seq_params = cm->seq_params; // Special case code to reduce pulsing when key frames are forced at a // fixed interval. 
Note the reconstruction error if it is the frame before // the force key frame - if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { + if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { #if CONFIG_AV1_HIGHBITDEPTH if (seq_params->use_highbitdepth) { cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); @@ -2884,7 +2901,7 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, uint8_t *dest, int *largest_tile_id) { const AV1_COMMON *const cm = &cpi->common; - assert(cm->seq_params.enable_superres); + assert(cm->seq_params->enable_superres); assert(av1_superres_in_recode_allowed(cpi)); aom_codec_err_t err = AOM_CODEC_OK; av1_save_all_coding_context(cpi); @@ -2904,9 +2921,9 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, int64_t superres_rates[SCALE_NUMERATOR]; int superres_largest_tile_ids[SCALE_NUMERATOR]; // Use superres for Key-frames and Alt-ref frames only. - const GF_GROUP *const gf_group = &cpi->gf_group; - if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE && - gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) { + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] != OVERLAY_UPDATE && + gf_group->update_type[cpi->gf_frame_index] != INTNL_OVERLAY_UPDATE) { for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) { superres_cfg->superres_scale_denominator = denom; @@ -2952,7 +2969,7 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, const int64_t this_rate = superres_rates[this_index]; const int this_largest_tile_id = superres_largest_tile_ids[this_index]; const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST( - rdmult, this_rate, this_sse, cm->seq_params.bit_depth); + rdmult, this_rate, this_sse, cm->seq_params->bit_depth); if (this_rdcost < proj_rdcost1) { sse1 = this_sse; rate1 = this_rate; @@ -2962,7 +2979,7 @@ static int 
encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, } } const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST( - rdmult, rate2, sse2, cm->seq_params.bit_depth); + rdmult, rate2, sse2, cm->seq_params->bit_depth); // Re-encode with superres if it's better. if (proj_rdcost1 < proj_rdcost2) { restore_all_coding_context(cpi); @@ -3007,9 +3024,9 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex); proj_rdcost1 = RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST( - rdmult, rate2, sse2, cm->seq_params.bit_depth); + rdmult, rate2, sse2, cm->seq_params->bit_depth); // Re-encode with superres if it's better. if (proj_rdcost1 < proj_rdcost2) { restore_all_coding_context(cpi); @@ -3034,6 +3051,42 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, return err; } +#if !CONFIG_REALTIME_ONLY +static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + const FIRSTPASS_STATS *const total_stats = + twopass->stats_buf_ctx->total_stats; + + if (is_one_pass_rt_params(cpi) || + (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) || + (is_fp_wavelet_energy_invalid(total_stats) == 0)) + return; + + const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? 
cpi->initial_mbs + : cpi->common.mi_params.MBs; + const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source; + const uint8_t *const src = unfiltered_source->y_buffer; + const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = unfiltered_source->y_stride; + const BLOCK_SIZE fp_block_size = + get_fp_block_size(cpi->is_screen_content_type); + const int fp_block_size_width = block_size_wide[fp_block_size]; + const int fp_block_size_height = block_size_high[fp_block_size]; + const int num_unit_cols = + get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width); + const int num_unit_rows = + get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height); + const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8); + const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8); + int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input( + src, stride, hbd, num_8x8_rows, num_8x8_cols); + + twopass->frame_avg_haar_energy = + log(((double)frame_avg_wavelet_energy / num_mbs) + 1.0); +} +#endif + extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename); @@ -3055,7 +3108,7 @@ extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest) { AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; + SequenceHeader *const seq_params = cm->seq_params; CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; @@ -3070,6 +3123,10 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, av1_set_screen_content_options(cpi, features); } +#if !CONFIG_REALTIME_ONLY + calculate_frame_avg_haar_energy(cpi); +#endif + // frame type has been decided outside of this function call cm->cur_frame->frame_type = current_frame->frame_type; @@ -3088,7 +3145,7 @@ static int 
encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->last_frame_type = current_frame->frame_type; if (frame_is_sframe(cm)) { - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; // S frame will wipe out any previously encoded altref so we cannot place // an overlay frame gf_group->update_type[gf_group->size] = GF_UPDATE; @@ -3110,7 +3167,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->ref_frame_id[i] = display_frame_id; } - cpi->seq_params_locked = 1; + cpi->ppi->seq_params_locked = 1; #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. @@ -3147,7 +3204,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, if (!is_stat_generation_stage(cpi) && cpi->common.features.allow_screen_content_tools && !frame_is_intra_only(cm)) { - if (cpi->common.seq_params.force_integer_mv == 2) { + if (cpi->common.seq_params->force_integer_mv == 2) { // Adaptive mode: see what previous frame encoded did if (cpi->unscaled_last_source != NULL) { features->cur_frame_force_integer_mv = av1_is_integer_mv( @@ -3157,7 +3214,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, } } else { cpi->common.features.cur_frame_force_integer_mv = - cpi->common.seq_params.force_integer_mv; + cpi->common.seq_params->force_integer_mv; } } else { cpi->common.features.cur_frame_force_integer_mv = 0; @@ -3290,7 +3347,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->superres_mode = orig_superres_mode; // restore } - cpi->seq_params_locked = 1; + cpi->ppi->seq_params_locked = 1; // Update reference frame ids for reference frames this frame will overwrite if (seq_params->frame_id_numbers_present_flag) { @@ -3332,10 +3389,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, refresh_reference_frames(cpi); -#if CONFIG_ENTROPY_STATS - av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts); -#endif // CONFIG_ENTROPY_STATS - if 
(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { *cm->fc = cpi->tile_data[largest_tile_id].tctx; av1_reset_cdf_symbol_counters(cm->fc); @@ -3417,7 +3470,13 @@ int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, current_frame->display_order_hint = current_frame->order_hint; current_frame->order_hint %= - (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1)); + (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1)); + +#if CONFIG_FRAME_PARALLEL_ENCODE + current_frame->pyramid_level = get_true_pyr_level( + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], + current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (is_stat_generation_stage(cpi)) { #if !CONFIG_REALTIME_ONLY @@ -3442,9 +3501,9 @@ static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd, AV1_COMMON *const cm = &cpi->common; if (!cpi->denoise_and_model) { cpi->denoise_and_model = aom_denoise_and_model_alloc( - cm->seq_params.bit_depth, block_size, noise_level); + cm->seq_params->bit_depth, block_size, noise_level); if (!cpi->denoise_and_model) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating denoise and model"); return -1; } @@ -3452,7 +3511,7 @@ static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd, if (!cpi->film_grain_table) { cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); if (!cpi->film_grain_table) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating grain table"); return -1; } @@ -3474,7 +3533,7 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; int res 
= 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; @@ -3516,7 +3575,7 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, res = -1; #if CONFIG_INTERNAL_STATS aom_usec_timer_mark(&timer); - cpi->time_receive_data += aom_usec_timer_elapsed(&timer); + cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer); #endif // Note: Regarding profile setting, the following checks are added to help @@ -3528,20 +3587,20 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, // header. if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && (subsampling_x != 1 || subsampling_y != 1)) { - aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, "Non-4:2:0 color format requires profile 1 or 2"); res = -1; } if ((seq_params->profile == PROFILE_1) && !(subsampling_x == 0 && subsampling_y == 0)) { - aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, "Profile 1 requires 4:4:4 color format"); res = -1; } if ((seq_params->profile == PROFILE_2) && (seq_params->bit_depth <= AOM_BITS_10) && !(subsampling_x == 1 && subsampling_y == 0)) { - aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); res = -1; } @@ -3549,6 +3608,20 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, return res; } +#if CONFIG_ENTROPY_STATS +void print_entropy_stats(AV1_PRIMARY *const ppi) { + if (!ppi->cpi) return; + + if (ppi->cpi->oxcf.pass != 1 && + ppi->cpi->common.current_frame.frame_number > 0) { + fprintf(stderr, "Writing counts.stt\n"); + FILE *f = fopen("counts.stt", "wb"); + fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f); + fclose(f); + } +} +#endif // CONFIG_ENTROPY_STATS + #if CONFIG_INTERNAL_STATS extern double 
av1_get_blockiness(const unsigned char *img1, int img1_pitch, const unsigned char *img2, int img2_pitch, @@ -3564,11 +3637,16 @@ static void adjust_image_stat(double y, double u, double v, double all, } static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { + AV1_PRIMARY *const ppi = cpi->ppi; AV1_COMMON *const cm = &cpi->common; double samples = 0.0; const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + if (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) + return; + #if CONFIG_INTER_STATS_ONLY if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame #endif @@ -3578,9 +3656,9 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; double y, u, v, frame_all; - cpi->count[0]++; - cpi->count[1]++; - if (cpi->b_calculate_psnr) { + ppi->count[0]++; + ppi->count[1]++; + if (cpi->ppi->b_calculate_psnr) { PSNR_STATS psnr; double weight[2] = { 0.0, 0.0 }; double frame_ssim2[2] = { 0.0, 0.0 }; @@ -3591,34 +3669,30 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { aom_calc_psnr(orig, recon, &psnr); #endif adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], - &(cpi->psnr[0])); - cpi->total_sq_error[0] += psnr.sse[0]; - cpi->total_samples[0] += psnr.samples[0]; + &(ppi->psnr[0])); + ppi->total_sq_error[0] += psnr.sse[0]; + ppi->total_samples[0] += psnr.samples[0]; samples = psnr.samples[0]; - // TODO(yaowu): unify these two versions into one. 
- if (cm->seq_params.use_highbitdepth) - aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth, - frame_ssim2); - else - aom_calc_ssim(orig, recon, &weight[0], &frame_ssim2[0]); + aom_calc_ssim(orig, recon, bit_depth, in_bit_depth, + cm->seq_params->use_highbitdepth, weight, frame_ssim2); - cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2[0]); - cpi->summed_quality += frame_ssim2[0] * weight[0]; - cpi->summed_weights += weight[0]; + ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]); + ppi->summed_quality += frame_ssim2[0] * weight[0]; + ppi->summed_weights += weight[0]; #if CONFIG_AV1_HIGHBITDEPTH // Compute PSNR based on stream bit depth if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3], - psnr.psnr_hbd[0], &cpi->psnr[1]); - cpi->total_sq_error[1] += psnr.sse_hbd[0]; - cpi->total_samples[1] += psnr.samples_hbd[0]; + psnr.psnr_hbd[0], &ppi->psnr[1]); + ppi->total_sq_error[1] += psnr.sse_hbd[0]; + ppi->total_samples[1] += psnr.samples_hbd[0]; - cpi->worst_ssim_hbd = AOMMIN(cpi->worst_ssim_hbd, frame_ssim2[1]); - cpi->summed_quality_hbd += frame_ssim2[1] * weight[1]; - cpi->summed_weights_hbd += weight[1]; + ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]); + ppi->summed_quality_hbd += frame_ssim2[1] * weight[1]; + ppi->summed_weights_hbd += weight[1]; } #endif @@ -3636,48 +3710,207 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } #endif } - if (cpi->b_calculate_blockiness) { - if (!cm->seq_params.use_highbitdepth) { + if (ppi->b_calculate_blockiness) { + if (!cm->seq_params->use_highbitdepth) { const double frame_blockiness = av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height); - cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness); - cpi->total_blockiness += frame_blockiness; + ppi->worst_blockiness = 
AOMMAX(ppi->worst_blockiness, frame_blockiness); + ppi->total_blockiness += frame_blockiness; } - if (cpi->b_calculate_consistency) { - if (!cm->seq_params.use_highbitdepth) { + if (ppi->b_calculate_consistency) { + if (!cm->seq_params->use_highbitdepth) { const double this_inconsistency = aom_get_ssim_metrics( orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, - orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); + orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1); const double peak = (double)((1 << in_bit_depth) - 1); const double consistency = - aom_sse_to_psnr(samples, peak, cpi->total_inconsistency); + aom_sse_to_psnr(samples, peak, ppi->total_inconsistency); if (consistency > 0.0) - cpi->worst_consistency = - AOMMIN(cpi->worst_consistency, consistency); - cpi->total_inconsistency += this_inconsistency; + ppi->worst_consistency = + AOMMIN(ppi->worst_consistency, consistency); + ppi->total_inconsistency += this_inconsistency; } } } frame_all = aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); - adjust_image_stat(y, u, v, frame_all, &cpi->fastssim); + adjust_image_stat(y, u, v, frame_all, &ppi->fastssim); frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); - adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs); + adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs); + } +} + +void print_internal_stats(AV1_PRIMARY *const ppi) { + if (!ppi->cpi) return; + AV1_COMP *const cpi = ppi->cpi; + + if (ppi->cpi->oxcf.pass != 1 && + ppi->cpi->common.current_frame.frame_number > 0) { + aom_clear_system_state(); + char headings[512] = { 0 }; + char results[512] = { 0 }; + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = + (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) / + 10000000.000; + double total_encode_time = + (ppi->total_time_receive_data + ppi->total_time_compress_data) / + 1000.000; + const double dr = + (double)ppi->total_bytes * (double)8 / 
(double)1000 / time_encoded; + const double peak = + (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1); + const double target_rate = + (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000; + const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); + + if (ppi->b_calculate_psnr) { + const double total_psnr = aom_sse_to_psnr( + (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]); + const double total_ssim = + 100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0); + snprintf(headings, sizeof(headings), + "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" + "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsrnY\tAPsnrCb\tAPsnrCr"); + snprintf(results, sizeof(results), + "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", + dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + total_ssim, total_ssim, + ppi->fastssim.stat[STAT_ALL] / ppi->count[0], + ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst, + ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst, + ppi->psnr[0].stat[STAT_Y] / ppi->count[0], + ppi->psnr[0].stat[STAT_U] / ppi->count[0], + ppi->psnr[0].stat[STAT_V] / ppi->count[0]); + + if (ppi->b_calculate_blockiness) { + SNPRINT(headings, "\t Block\tWstBlck"); + SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]); + SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness); + } + + if (ppi->b_calculate_consistency) { + double consistency = + aom_sse_to_psnr((double)ppi->total_samples[0], peak, + (double)ppi->total_inconsistency); + + SNPRINT(headings, "\tConsist\tWstCons"); + SNPRINT2(results, "\t%7.3f", consistency); + SNPRINT2(results, "\t%7.3f", ppi->worst_consistency); + } + + SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); + SNPRINT2(results, "\t%8.0f", total_encode_time); + SNPRINT2(results, " %7.2f", rate_err); 
+ SNPRINT2(results, " %7.2f", fabs(rate_err)); + + SNPRINT(headings, "\tAPsnr611"); + SNPRINT2(results, " %7.3f", + (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] + + ppi->psnr[0].stat[STAT_V]) / + (ppi->count[0] * 8)); + +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = ppi->seq_params.bit_depth; + // Since cpi->source->flags is not available here, but total_samples[1] + // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was + // true in compute_internal_stats + if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) { + const double peak_hbd = (double)((1 << bit_depth) - 1); + const double total_psnr_hbd = + aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd, + (double)ppi->total_sq_error[1]); + const double total_ssim_hbd = + 100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0); + SNPRINT(headings, + "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH" + " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH" + " AOMSSIMH VPSSIMPH WstSsimH"); + SNPRINT2(results, "\t%7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].worst); + SNPRINT2(results, " %7.3f", total_ssim_hbd); + SNPRINT2(results, " %7.3f", total_ssim_hbd); + SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd); + } +#endif + fprintf(f, "%s\n", headings); + fprintf(f, "%s\n", results); + } + + fclose(f); + + if (ppi->ssim_vars != NULL) { + aom_free(ppi->ssim_vars); + ppi->ssim_vars = NULL; + } } } #endif // CONFIG_INTERNAL_STATS +void av1_post_encode_updates(AV1_COMP 
*const cpi, size_t size, + int64_t time_stamp, int64_t time_end) { + AV1_PRIMARY *const ppi = cpi->ppi; + AV1_COMMON *const cm = &cpi->common; + // Note *size = 0 indicates a dropped frame for which psnr is not calculated + if (ppi->b_calculate_psnr && size > 0) { + if (cm->show_existing_frame || + (!is_stat_generation_stage(cpi) && cm->show_frame)) { + generate_psnr_packet(cpi); + } + } + + if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { + // Initialize level info. at the beginning of each sequence. + if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { + av1_init_level_info(cpi); + } + av1_update_level_info(cpi, size, time_stamp, time_end); + } + +#if CONFIG_INTERNAL_STATS + if (!is_stat_generation_stage(cpi)) { + compute_internal_stats(cpi, (int)size); + } +#endif // CONFIG_INTERNAL_STATS +} + int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, - size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush, + size_t *size, size_t avail_size, uint8_t *dest, + int64_t *time_stamp, int64_t *time_end, int flush, const aom_rational64_t *timestamp_ratio) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; +#if CONFIG_INTERNAL_STATS + cpi->frame_recode_hits = 0; + cpi->time_compress_data = 0; + cpi->bytes = 0; +#endif +#if CONFIG_ENTROPY_STATS + if (cpi->compressor_stage == ENCODE_STAGE) { + av1_zero(cpi->counts); + } +#endif + #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads <= 1 && "bitstream debug tool does not support multithreading"); @@ -3685,12 +3918,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 + cm->show_frame); #endif - if (cpi->use_svc && cm->number_spatial_layers > 1) { + if (cpi->ppi->use_svc && cpi->ppi->number_spatial_layers > 1) { av1_one_pass_cbr_svc_start_layer(cpi); } cm->showable_frame = 0; *size = 0; + cpi->available_bs_size 
= avail_size; #if CONFIG_INTERNAL_STATS struct aom_usec_timer cmptimer; aom_usec_timer_start(&cmptimer); @@ -3763,27 +3997,9 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, aom_usec_timer_mark(&cmptimer); cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer); #endif // CONFIG_INTERNAL_STATS - // Note *size = 0 indicates a dropped frame for which psnr is not calculated - if (cpi->b_calculate_psnr && *size > 0) { - if (cm->show_existing_frame || - (!is_stat_generation_stage(cpi) && cm->show_frame)) { - generate_psnr_packet(cpi); - } - } - if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { - // Initialize level info. at the beginning of each sequence. - if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { - av1_init_level_info(cpi); - } - av1_update_level_info(cpi, *size, *time_stamp, *time_end); - } + av1_post_encode_updates(cpi, *size, *time_stamp, *time_end); -#if CONFIG_INTERNAL_STATS - if (!is_stat_generation_stage(cpi)) { - compute_internal_stats(cpi, (int)(*size)); - } -#endif // CONFIG_INTERNAL_STATS #if CONFIG_SPEED_STATS if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) { cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count; @@ -3806,8 +4022,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { *dest = cm->cur_frame->buf; dest->y_width = cm->width; dest->y_height = cm->height; - dest->uv_width = cm->width >> cm->seq_params.subsampling_x; - dest->uv_height = cm->height >> cm->seq_params.subsampling_y; + dest->uv_width = cm->width >> cm->seq_params->subsampling_x; + dest->uv_height = cm->height >> cm->seq_params->subsampling_y; ret = 0; } else { ret = -1; @@ -3829,12 +4045,12 @@ aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, YV12_BUFFER_CONFIG *sd) { const int num_planes = av1_num_planes(cm); if (!equal_dimensions_and_border(new_frame, sd)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, 
AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(new_frame, sd, num_planes); - return cm->error.error_code; + return cm->error->error_code; } int av1_set_internal_size(AV1EncoderConfig *const oxcf, @@ -3919,7 +4135,7 @@ int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { return AOM_CODEC_OK; } -static void svc_set_updates_external_ref_frame_config( +static void svc_set_updates_ref_frame_config( ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, SVC *const svc) { ext_refresh_frame_flags->update_pending = 1; ext_refresh_frame_flags->last_frame = svc->refresh[svc->ref_idx[0]]; @@ -3980,7 +4196,7 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { av1_use_as_reference(&ext_flags->ref_frame_flags, ref); } else { - if (cpi->svc.external_ref_frame_config) { + if (cpi->svc.set_ref_frame_config) { int ref = svc_set_references_external_ref_frame_config(cpi); av1_use_as_reference(&ext_flags->ref_frame_flags, ref); } @@ -4008,9 +4224,8 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; ext_refresh_frame_flags->update_pending = 1; } else { - if (cpi->svc.external_ref_frame_config) - svc_set_updates_external_ref_frame_config(ext_refresh_frame_flags, - &cpi->svc); + if (cpi->svc.set_ref_frame_config) + svc_set_updates_ref_frame_config(ext_refresh_frame_flags, &cpi->svc); else ext_refresh_frame_flags->update_pending = 0; } @@ -4030,12 +4245,12 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { } } -aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { - if (!cpi) return NULL; +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) { + if (!ppi) return NULL; uint8_t header_buf[512] = { 0 }; const uint32_t sequence_header_size = - av1_write_sequence_header_obu(&cpi->common.seq_params, &header_buf[0]); + av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]); 
assert(sequence_header_size <= sizeof(header_buf)); if (sequence_header_size == 0) return NULL; @@ -4046,7 +4261,8 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); - if (av1_write_obu_header(&cpi->level_params, OBU_SEQUENCE_HEADER, 0, + if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count, + OBU_SEQUENCE_HEADER, 0, &header_buf[0]) != obu_header_size) { return NULL; } diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder.h b/third_party/libaom/source/libaom/av1/encoder/encoder.h index 905470f437..fe6e76f498 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder.h +++ b/third_party/libaom/source/libaom/av1/encoder/encoder.h @@ -35,6 +35,7 @@ #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/encodemb.h" +#include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/global_motion.h" #include "av1/encoder/level.h" @@ -49,6 +50,7 @@ #include "av1/encoder/tokenize.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/av1_noise_estimate.h" +#include "av1/encoder/bitstream.h" #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" @@ -119,6 +121,26 @@ enum { FRAMEFLAGS_ERROR_RESILIENT = 1 << 6, } UENUM1BYTE(FRAMETYPE_FLAGS); +#if CONFIG_FRAME_PARALLEL_ENCODE +// 0 level frames are sometimes used for rate control purposes, but for +// reference mapping purposes, the minimum level should be 1. 
+#define MIN_PYR_LEVEL 1 +static INLINE int get_true_pyr_level(int frame_level, int frame_order, + int max_layer_depth) { + if (frame_order == 0) { + // Keyframe case + return MIN_PYR_LEVEL; + } else if (frame_level == MAX_ARF_LAYERS) { + // Leaves + return max_layer_depth; + } else if (frame_level == (MAX_ARF_LAYERS + 1)) { + // Altrefs + return MIN_PYR_LEVEL; + } + return AOMMAX(MIN_PYR_LEVEL, frame_level); +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + enum { NO_AQ = 0, VARIANCE_AQ = 1, @@ -159,13 +181,6 @@ enum { /*!\cond */ typedef enum { - COST_UPD_SB, - COST_UPD_SBROW, - COST_UPD_TILE, - COST_UPD_OFF, -} COST_UPDATE_TYPE; - -typedef enum { MOD_FP, // First pass MOD_TF, // Temporal filtering MOD_TPL, // TPL @@ -173,12 +188,24 @@ typedef enum { MOD_ENC, // Encode stage MOD_LPF, // Deblocking loop filter MOD_CDEF_SEARCH, // CDEF search + MOD_CDEF, // CDEF frame MOD_LR, // Loop restoration filtering + MOD_PACK_BS, // Pack bitstream NUM_MT_MODULES } MULTI_THREADED_MODULES; /*!\endcond */ +/*!\enum COST_UPDATE_TYPE + * \brief This enum controls how often the entropy costs should be updated. + */ +typedef enum { + COST_UPD_SB, /*!< Update every sb. */ + COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ + COST_UPD_TILE, /*!< Update every tile. */ + COST_UPD_OFF, /*!< Turn off cost updates. */ +} COST_UPDATE_TYPE; + /*! * \brief Encoder config related to resize. */ @@ -623,6 +650,8 @@ typedef struct { COST_UPDATE_TYPE mode; // Indicates the update frequency for mv costs. COST_UPDATE_TYPE mv; + // Indicates the update frequency for dv costs. + COST_UPDATE_TYPE dv; } CostUpdateFreq; typedef struct { @@ -711,7 +740,10 @@ typedef struct { */ typedef struct { /*! - * Indicates the loop filter sharpness. + * Controls the level at which rate-distortion optimization of transform + * coefficients favours sharpness in the block. Has no impact on RD when set + * to zero (default). 
For values 1-7, eob and skip block optimization are + * avoided and rdmult is adjusted in favour of block sharpness. */ int sharpness; @@ -940,6 +972,10 @@ typedef struct AV1EncoderConfig { // format. bool save_as_annexb; + // The path for partition stats reading and writing, used in the experiment + // CONFIG_PARTITION_SEARCH_ORDER. + const char *partition_info_path; + /*!\endcond */ } AV1EncoderConfig; @@ -1267,6 +1303,7 @@ typedef struct TileDataEnc { TileInfo tile_info; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); FRAME_CONTEXT *row_ctx; + uint64_t abs_sum_level; uint8_t allow_update_cdf; InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; AV1EncRowMultiThreadSync row_mt_sync; @@ -1295,14 +1332,23 @@ typedef struct ThreadData { PALETTE_BUFFER *palette_buffer; CompoundTypeRdBuffers comp_rd_buffer; CONV_BUF_TYPE *tmp_conv_dst; + uint64_t abs_sum_level; uint8_t *tmp_pred_bufs[2]; int intrabc_used; int deltaq_used; + int coefficient_size; + int max_mv_magnitude; + int interp_filter_selected[SWITCHABLE]; FRAME_CONTEXT *tctx; VP64x64 *vt64x64; int32_t num_64x64_blocks; PICK_MODE_CONTEXT *firstpass_ctx; TemporalFilterData tf_data; + TplTxfmStats tpl_txfm_stats; + // Pointer to the array of structures to store gradient information of each + // pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level + // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). + PixelLevelGradientInfo *pixel_gradient_info; } ThreadData; struct EncWorkerData; @@ -1427,6 +1473,11 @@ typedef struct MultiThreadInfo { AV1LrSync lr_row_sync; /*! + * Pack bitstream multi-threading object. + */ + AV1EncPackBSSync pack_bs_sync; + + /*! * Global Motion multi-threading object. */ AV1GlobalMotionSync gm_sync; @@ -1440,6 +1491,11 @@ typedef struct MultiThreadInfo { * CDEF search multi-threading object. */ AV1CdefSync cdef_sync; + + /*! + * CDEF row multi-threading data. 
+ */ + AV1CdefWorkerData *cdef_worker; } MultiThreadInfo; /*!\cond */ @@ -1561,10 +1617,13 @@ enum { rd_pick_sb_modes_time, av1_rd_pick_intra_mode_sb_time, av1_rd_pick_inter_mode_sb_time, + set_params_rd_pick_inter_mode_time, + skip_inter_mode_time, handle_inter_mode_time, evaluate_motion_mode_for_winner_candidates_time, - handle_intra_mode_time, do_tx_search_time, + handle_intra_mode_time, + refine_winner_mode_tx_time, av1_search_palette_mode_time, handle_newmv_time, compound_type_rd_time, @@ -1609,11 +1668,15 @@ static INLINE char const *get_component_name(int index) { return "av1_rd_pick_intra_mode_sb_time"; case av1_rd_pick_inter_mode_sb_time: return "av1_rd_pick_inter_mode_sb_time"; + case set_params_rd_pick_inter_mode_time: + return "set_params_rd_pick_inter_mode_time"; + case skip_inter_mode_time: return "skip_inter_mode_time"; case handle_inter_mode_time: return "handle_inter_mode_time"; case evaluate_motion_mode_for_winner_candidates_time: return "evaluate_motion_mode_for_winner_candidates_time"; - case handle_intra_mode_time: return "handle_intra_mode_time"; case do_tx_search_time: return "do_tx_search_time"; + case handle_intra_mode_time: return "handle_intra_mode_time"; + case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time"; case av1_search_palette_mode_time: return "av1_search_palette_mode_time"; case handle_newmv_time: return "handle_newmv_time"; case compound_type_rd_time: return "compound_type_rd_time"; @@ -2045,12 +2108,88 @@ typedef struct { uint8_t *entropy_ctx; } CoeffBufferPool; +#if CONFIG_FRAME_PARALLEL_ENCODE +/*! + * \brief Max number of frames that can be encoded in a parallel encode set. + */ +#define MAX_PARALLEL_FRAMES 4 + +/*! + * \brief Structure to hold data of frame encoded in a given parallel encode + * set. + */ +typedef struct AV1_FP_OUT_DATA { + /*! + * Buffer to store packed bitstream data of a frame. + */ + unsigned char *cx_data_frame; + + /*! + * Allocated size of the cx_data_frame buffer. 
+ */ + size_t cx_data_sz; + + /*! + * Size of data written in the cx_data_frame buffer. + */ + size_t frame_size; + + /*! + * Display order hint of frame whose packed data is in cx_data_frame buffer. + */ + int frame_display_order_hint; +} AV1_FP_OUT_DATA; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + /*! * \brief Top level primary encoder structure */ typedef struct AV1_PRIMARY { +#if CONFIG_FRAME_PARALLEL_ENCODE + /*! + * Array of frame level encoder stage top level structures + */ + struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES]; + + /*! + * Number of frame level contexts(cpis) + */ + int num_fp_contexts; + + /*! + * Array of structures to hold data of frames encoded in a given parallel + * encode set. + */ + struct AV1_FP_OUT_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1]; + + /*! + * Loopfilter levels of the previous encoded frame. + */ + int filter_level[2]; + int filter_level_u; + int filter_level_v; + + /*! + * Largest MV component used in previous encoded frame during + * stats consumption stage. + */ + int max_mv_magnitude; + + /*! + * Temporary variable simulating the delayed frame_probability update. + */ + FrameProbInfo temp_frame_probs; + + /*! + * Temporary variable used in simulating the delayed update of + * avg_frame_qindex. + */ + int temp_avg_frame_qindex[FRAME_TYPES]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE /*! * Encode stage top level structure + * When CONFIG_FRAME_PARALLEL_ENCODE is enabled this is the same as + * parallel_cpi[0] */ struct AV1_COMP *cpi; @@ -2063,6 +2202,186 @@ typedef struct AV1_PRIMARY { * Look-ahead context. */ struct lookahead_ctx *lookahead; + + /*! + * Sequence parameters have been transmitted already and locked + * or not. Once locked av1_change_config cannot change the seq + * parameters. + */ + int seq_params_locked; + + /*! + * Pointer to internal utility functions that manipulate aom_codec_* data + * structures. + */ + struct aom_codec_pkt_list *output_pkt_list; + + /*! 
+ * When set, indicates that internal ARFs are enabled. + */ + int internal_altref_allowed; + + /*! + * Information related to a gf group. + */ + GF_GROUP gf_group; + + /*! + * Track prior gf group state. + */ + GF_STATE gf_state; + + /*! + * Flag indicating whether look ahead processing (LAP) is enabled. + */ + int lap_enabled; + + /*! + * Parameters for AV1 bitstream levels. + */ + AV1LevelParams level_params; + + /*! + * Calculates PSNR on each frame when set to 1. + */ + int b_calculate_psnr; + + /*! + * Number of frames left to be encoded, is 0 if limit is not set. + */ + int frames_left; + + /*! + * Information related to two pass encoding. + */ + TWO_PASS twopass; + + /*! + * Rate control related parameters. + */ + PRIMARY_RATE_CONTROL p_rc; + + /*! + * Frame buffer holding the temporally filtered source frame. It can be KEY + * frame or ARF frame. + */ + YV12_BUFFER_CONFIG alt_ref_buffer; + + /*! + * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader seq_params; + + /*! + * Indicates whether to use SVC. + */ + int use_svc; + + /*! + * If true, buffer removal times are present. + */ + bool buffer_removal_time_present; + + /*! + * Number of temporal layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_temporal_layers; + + /*! + * Number of spatial layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_spatial_layers; + + /*! + * Code and details about current error status. + */ + struct aom_internal_error_info error; + + /*! + * Function pointers to variants of sse/sad/variance computation functions. + * fn_ptr[i] indicates the list of function pointers corresponding to block + * size i. + */ + aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; + + /*! + * Scaling factors used in the RD multiplier modulation. + * TODO(sdeng): consider merge the following arrays. 
+ * tpl_rdmult_scaling_factors is a temporary buffer used to store the + * intermediate scaling factors which are used in the calculation of + * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the + * intermediate scaling factor of the ith 16 x 16 block in raster scan order. + */ + double *tpl_rdmult_scaling_factors; + + /*! + * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of + * the ith 16 x 16 block in raster scan order. + */ + double *tpl_sb_rdmult_scaling_factors; + + /*! + * Parameters related to tpl. + */ + TplParams tpl_data; + + /*! + * Motion vector stats of the previous encoded frame. + */ + MV_STATS mv_stats; + +#if CONFIG_INTERNAL_STATS + /*!\cond */ + uint64_t total_time_receive_data; + uint64_t total_time_compress_data; + + unsigned int total_mode_chosen_counts[MAX_MODES]; + + int count[2]; + uint64_t total_sq_error[2]; + uint64_t total_samples[2]; + ImageStat psnr[2]; + + double total_blockiness; + double worst_blockiness; + + int total_bytes; + double summed_quality; + double summed_weights; + double summed_quality_hbd; + double summed_weights_hbd; + unsigned int total_recode_hits; + double worst_ssim; + double worst_ssim_hbd; + + ImageStat fastssim; + ImageStat psnrhvs; + + int b_calculate_blockiness; + int b_calculate_consistency; + + double total_inconsistency; + double worst_consistency; + Ssimv *ssim_vars; + Metrics metrics; + /*!\endcond */ +#endif + +#if CONFIG_ENTROPY_STATS + /*! + * Aggregates frame counts for the sequence. + */ + FRAME_COUNTS aggregate_fc; +#endif // CONFIG_ENTROPY_STATS + + /*! + * For each type of reference frame, this contains the index of a reference + * frame buffer for a reference frame of the same type. We use this to + * choose our primary reference frame (which is the most recent reference + * frame of the same type as the current frame). + */ + int fb_of_context_type[REF_FRAMES]; } AV1_PRIMARY; /*! 
@@ -2173,9 +2492,9 @@ typedef struct AV1_COMP { YV12_BUFFER_CONFIG *unfiltered_source; /*! - * Parameters related to tpl. + * Skip tpl setup when tpl data from gop length decision can be reused. */ - TplParams tpl_data; + int skip_tpl_setup_stats; /*! * Temporal filter context. @@ -2209,14 +2528,6 @@ typedef struct AV1_COMP { RefreshFrameFlagsInfo refresh_frame; /*! - * For each type of reference frame, this contains the index of a reference - * frame buffer for a reference frame of the same type. We use this to - * choose our primary reference frame (which is the most recent reference - * frame of the same type as the current frame). - */ - int fb_of_context_type[REF_FRAMES]; - - /*! * Flags signalled by the external interface at frame level. */ ExternalFlags ext_flags; @@ -2275,12 +2586,6 @@ typedef struct AV1_COMP { double framerate; /*! - * Pointer to internal utility functions that manipulate aom_codec_* data - * structures. - */ - struct aom_codec_pkt_list *output_pkt_list; - - /*! * Bitmask indicating which reference buffers may be referenced by this frame. */ int ref_frame_flags; @@ -2322,26 +2627,9 @@ typedef struct AV1_COMP { ActiveMap active_map; /*! - * Function pointers to variants of sse/sad/variance computation functions. - * fn_ptr[i] indicates the list of function pointers corresponding to block - * size i. - */ - aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; - - /*! - * Information related to two pass encoding. - */ - TWO_PASS twopass; - - /*! - * Information related to a gf group. - */ - GF_GROUP gf_group; - - /*! - * Track prior gf group state. + * The frame processing order within a GOP. */ - GF_STATE gf_state; + unsigned char gf_frame_index; /*! * To control the reference frame buffer and selection. @@ -2349,58 +2637,20 @@ typedef struct AV1_COMP { RefBufferStack ref_buffer_stack; /*! - * Frame buffer holding the temporally filtered source frame. It can be KEY - * frame or ARF frame. - */ - YV12_BUFFER_CONFIG alt_ref_buffer; - - /*! 
* Tell if OVERLAY frame shows existing alt_ref frame. */ int show_existing_alt_ref; #if CONFIG_INTERNAL_STATS /*!\cond */ - uint64_t time_receive_data; uint64_t time_compress_data; unsigned int mode_chosen_counts[MAX_MODES]; - - int count[2]; - uint64_t total_sq_error[2]; - uint64_t total_samples[2]; - ImageStat psnr[2]; - - double total_blockiness; - double worst_blockiness; - int bytes; - double summed_quality; - double summed_weights; - double summed_quality_hbd; - double summed_weights_hbd; - unsigned int tot_recode_hits; - double worst_ssim; - double worst_ssim_hbd; - - ImageStat fastssim; - ImageStat psnrhvs; - - int b_calculate_blockiness; - int b_calculate_consistency; - - double total_inconsistency; - double worst_consistency; - Ssimv *ssim_vars; - Metrics metrics; + unsigned int frame_recode_hits; /*!\endcond */ #endif - /*! - * Calculates PSNR on each frame when set to 1. - */ - int b_calculate_psnr; - #if CONFIG_SPEED_STATS /*! * For debugging: number of transform searches we have performed. @@ -2458,13 +2708,6 @@ typedef struct AV1_COMP { TokenInfo token_info; /*! - * Sequence parameters have been transmitted already and locked - * or not. Once locked av1_change_config cannot change the seq - * parameters. - */ - int seq_params_locked; - - /*! * VARIANCE_AQ segment map refresh. */ int vaq_refresh; @@ -2492,21 +2735,11 @@ typedef struct AV1_COMP { int existing_fb_idx_to_show; /*! - * When set, indicates that internal ARFs are enabled. - */ - int internal_altref_allowed; - - /*! * A flag to indicate if intrabc is ever used in current frame. */ int intrabc_used; /*! - * Tables to calculate IntraBC MV cost. - */ - IntraBCMVCosts dv_costs; - - /*! * Mark which ref frames can be skipped for encoding current frame during RDO. */ int prune_ref_frame_mask; @@ -2571,9 +2804,9 @@ typedef struct AV1_COMP { #endif /*! - * Parameters for AV1 bitstream levels. + * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation. 
*/ - AV1LevelParams level_params; + int frame_header_count; /*! * Whether any no-zero delta_q was actually used. @@ -2586,20 +2819,6 @@ typedef struct AV1_COMP { RefFrameDistanceInfo ref_frame_dist_info; /*! - * Scaling factors used in the RD multiplier modulation. - * TODO(sdeng): consider merge the following arrays. - * tpl_rdmult_scaling_factors is a temporary buffer used to store the - * intermediate scaling factors which are used in the calculation of - * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the - * intermediate scaling factor of the ith 16 x 16 block in raster scan order. - */ - double *tpl_rdmult_scaling_factors; - /*! - * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of - * the ith 16 x 16 block in raster scan order. - */ - double *tpl_sb_rdmult_scaling_factors; - /*! * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of * the ith 16 x 16 block in raster scan order. This scaling factor is used for * RD multiplier modulation when SSIM tuning is enabled. @@ -2621,30 +2840,16 @@ typedef struct AV1_COMP { #endif /*! - * Indicates whether to use SVC. - */ - int use_svc; - /*! * Parameters for scalable video coding. */ SVC svc; /*! - * Flag indicating whether look ahead processing (LAP) is enabled. - */ - int lap_enabled; - /*! * Indicates whether current processing stage is encode stage or LAP stage. */ COMPRESSOR_STAGE compressor_stage; /*! - * Some motion vector stats from the last encoded frame to help us decide what - * precision to use to encode the current frame. - */ - MV_STATS mv_stats; - - /*! * Frame type of the last frame. May be used in some heuristics for speeding * up the encoding. */ @@ -2686,14 +2891,35 @@ typedef struct AV1_COMP { uint8_t *consec_zero_mv; /*! - * Number of frames left to be encoded, is 0 if limit is not set. + * Block size of first pass encoding */ - int frames_left; + BLOCK_SIZE fp_block_size; /*! 
- * Block size of first pass encoding + * The counter of encoded super block, used to differentiate block names. + * This number starts from 0 and increases whenever a super block is encoded. */ - BLOCK_SIZE fp_block_size; + int sb_counter; + + /*! + * Available bitstream buffer size in bytes + */ + size_t available_bs_size; + + /*! + * The controller of the external partition model. + * It is used to do partition type selection based on external models. + */ + ExtPartController ext_part_controller; + +#if CONFIG_FRAME_PARALLEL_ENCODE + /*! + * A flag to indicate frames that will update their data to the primary + * context at the end of the encode. It is set for non-parallel frames and the + * last frame in encode order in a given parallel encode set. + */ + bool do_frame_data_update; +#endif } AV1_COMP; /*! @@ -2773,26 +2999,39 @@ void av1_initialize_enc(void); struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, BufferPool *const pool, - FIRSTPASS_STATS *frame_stats_buf, COMPRESSOR_STAGE stage, - int num_lap_buffers, - int lap_lag_in_frames, - STATS_BUFFER_CTX *stats_buf_context); + int lap_lag_in_frames); -struct AV1_PRIMARY *av1_create_primary_compressor(); +struct AV1_PRIMARY *av1_create_primary_compressor( + struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, + AV1EncoderConfig *oxcf); void av1_remove_compressor(AV1_COMP *cpi); void av1_remove_primary_compressor(AV1_PRIMARY *ppi); -void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf); +#if CONFIG_ENTROPY_STATS +void print_entropy_stats(AV1_PRIMARY *const ppi); +#endif +#if CONFIG_INTERNAL_STATS +void print_internal_stats(AV1_PRIMARY *ppi); +#endif + +void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, + bool *sb_size_changed); + +void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf, + bool sb_size_changed); void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y); -void 
av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, +void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, const AV1EncoderConfig *oxcf, int use_svc); +void av1_post_encode_updates(AV1_COMP *const cpi, size_t size, + int64_t time_stamp, int64_t time_end); + /*!\endcond */ /*!\brief Obtain the raw frame data @@ -2827,6 +3066,7 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, * \param[in] cpi Top-level encoder structure * \param[in] frame_flags Flags to decide how to encoding the frame * \param[in] size Bitstream size + * \param[in] avail_size Available bitstream buffer size * \param[in] dest Bitstream output * \param[out] time_stamp Time stamp of the frame * \param[out] time_end Time end @@ -2840,8 +3080,8 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, * \retval #AOM_CODEC_ERROR */ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, - size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush, + size_t *size, size_t avail_size, uint8_t *dest, + int64_t *time_stamp, int64_t *time_end, int flush, const aom_rational64_t *timebase); /*!\brief Run 1-pass/2-pass encoding @@ -2902,6 +3142,71 @@ void av1_set_screen_content_options(struct AV1_COMP *cpi, void av1_update_frame_size(AV1_COMP *cpi); +#if CONFIG_FRAME_PARALLEL_ENCODE +typedef struct { + int pyr_level; + int disp_order; +} RefFrameMapPair; + +static INLINE void init_ref_map_pair( + AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { + if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) { + memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES); + return; + } + memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES); + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + // Get reference frame buffer. 
+ const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx]; + if (ref_frame_map_pairs[map_idx].disp_order == -1) continue; + if (buf == NULL) { + ref_frame_map_pairs[map_idx].disp_order = -1; + ref_frame_map_pairs[map_idx].pyr_level = -1; + continue; + } else if (buf->ref_count > 1) { + // Once the keyframe is coded, the slots in ref_frame_map will all + // point to the same frame. In that case, all subsequent pointers + // matching the current are considered "free" slots. This will find + // the next occurance of the current pointer if ref_count indicates + // there are multiple instances of it and mark it as free. + for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) { + const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2]; + if (buf2 == buf) { + ref_frame_map_pairs[idx2].disp_order = -1; + ref_frame_map_pairs[idx2].pyr_level = -1; + } + } + } + ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint; + ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level; + } +} + +static AOM_INLINE void calc_frame_data_update_flag( + GF_GROUP *const gf_group, int gf_frame_index, + bool *const do_frame_data_update) { + *do_frame_data_update = true; + // Set the flag to false for all frames in a given parallel encode set except + // the last frame in the set with frame_parallel_level = 2. + if (gf_group->frame_parallel_level[gf_frame_index] == 1) { + *do_frame_data_update = false; + } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) { + // Check if this is the last frame in the set with frame_parallel_level = 2. 
+ for (int i = gf_frame_index + 1; i < gf_group->size; i++) { + if ((gf_group->frame_parallel_level[i] == 0 && + (gf_group->update_type[i] == ARF_UPDATE || + gf_group->update_type[i] == INTNL_ARF_UPDATE)) || + gf_group->frame_parallel_level[i] == 1) { + break; + } else if (gf_group->frame_parallel_level[i] == 2) { + *do_frame_data_update = false; + break; + } + } + } +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // TODO(jingning): Move these functions as primitive members for the new cpi // class. static INLINE void stack_push(int *stack, int *stack_size, int item) { @@ -2949,8 +3254,9 @@ ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) { } static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { - const GF_GROUP *const gf_group = &cpi->gf_group; - const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const FRAME_UPDATE_TYPE update_type = + gf_group->update_type[cpi->gf_frame_index]; return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE || update_type == GF_UPDATE; @@ -3009,10 +3315,25 @@ static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) { return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf; } +static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) { + return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) && + (gf_cfg->gf_min_pyr_height == 0); +} + +static AOM_INLINE int use_ml_model_to_decide_flat_gop( + const RateControlCfg *rc_cfg) { + return (rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 200); +} + +// Helper function to compute number of blocks on either side of the frame. 
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) { + return (frame_length + mb_length - 1) / mb_length; +} + // Check if statistics generation stage static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) { assert(IMPLIES(cpi->compressor_stage == LAP_STAGE, - cpi->oxcf.pass == 0 && cpi->lap_enabled)); + cpi->oxcf.pass == 0 && cpi->ppi->lap_enabled)); return (cpi->oxcf.pass == 1 || (cpi->compressor_stage == LAP_STAGE)); } // Check if statistics consumption stage @@ -3024,7 +3345,7 @@ static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) { static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) { return (is_stat_consumption_stage_twopass(cpi) || (cpi->oxcf.pass == 0 && (cpi->compressor_stage == ENCODE_STAGE) && - cpi->lap_enabled)); + cpi->ppi->lap_enabled)); } /*!\endcond */ @@ -3037,11 +3358,18 @@ static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) { * \return 0 if no stats for current stage else 1 */ static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) { - assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE)); - return (cpi->oxcf.pass == 0 && !cpi->lap_enabled); + assert( + IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE)); + return (cpi->oxcf.pass == 0 && !cpi->ppi->lap_enabled); } + /*!\cond */ +static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) { + return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && + cpi->oxcf.gf_cfg.lag_in_frames == 0; +} + // Function return size of frame stats buffer static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) { /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */ @@ -3208,7 +3536,7 @@ static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf, // Note: The OBU returned is in Low Overhead Bitstream Format. Specifically, // the obu_has_size_field bit is set, and the buffer contains the obu_size // field. 
-aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi); +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi); #define MAX_GFUBOOST_FACTOR 10.0 #define MIN_GFUBOOST_FACTOR 4.0 @@ -3229,9 +3557,9 @@ static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, } // Get update type of the current frame. -static INLINE FRAME_UPDATE_TYPE -get_frame_update_type(const GF_GROUP *gf_group) { - return gf_group->update_type[gf_group->index]; +static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group, + int gf_frame_index) { + return gf_group->update_type[gf_frame_index]; } static INLINE int av1_pixels_to_mi(int pixels) { @@ -3241,14 +3569,15 @@ static INLINE int av1_pixels_to_mi(int pixels) { static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; - return cpi->b_calculate_psnr && !is_stat_generation_stage(cpi) && + return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && cm->show_frame; } #if CONFIG_AV1_TEMPORAL_DENOISING static INLINE int denoise_svc(const struct AV1_COMP *const cpi) { - return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= - cpi->svc.first_layer_denoise)); + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); } #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h b/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h index eae34e0fe6..6eb44e7ee1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h +++ b/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h @@ -56,7 +56,7 @@ static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { TokenInfo *token_info = &cpi->token_info; if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } @@ -78,6 +78,13 @@ 
static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->td.mb.mv_costs, (MvCosts *)aom_calloc(1, sizeof(MvCosts))); + if (cpi->td.mb.dv_costs) { + aom_free(cpi->td.mb.dv_costs); + cpi->td.mb.dv_costs = NULL; + } + CHECK_MEM_ERROR(cm, cpi->td.mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.mb.dv_costs))); + av1_setup_shared_coeff_buffer(&cpi->common, &cpi->td.shared_coeff_buf); av1_setup_sms_tree(cpi, &cpi->td); cpi->td.firstpass_ctx = @@ -186,19 +193,10 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->ssim_rdmult_scaling_factors); cpi->ssim_rdmult_scaling_factors = NULL; - aom_free(cpi->tpl_rdmult_scaling_factors); - cpi->tpl_rdmult_scaling_factors = NULL; - - aom_free(cpi->tpl_sb_rdmult_scaling_factors); - cpi->tpl_sb_rdmult_scaling_factors = NULL; - #if CONFIG_TUNE_VMAF aom_free(cpi->vmaf_info.rdmult_scaling_factors); cpi->vmaf_info.rdmult_scaling_factors = NULL; - -#if CONFIG_USE_VMAF_RC - aom_close_vmaf_model_rc(cpi->vmaf_info.vmaf_model); -#endif + aom_close_vmaf_model(cpi->vmaf_info.vmaf_model); #endif #if CONFIG_TUNE_BUTTERAUGLI @@ -215,6 +213,11 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { cpi->td.mb.mv_costs = NULL; } + if (cpi->td.mb.dv_costs) { + aom_free(cpi->td.mb.dv_costs); + cpi->td.mb.dv_costs = NULL; + } + aom_free(cpi->td.mb.inter_modes_info); cpi->td.mb.inter_modes_info = NULL; @@ -235,7 +238,6 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); cpi->td.firstpass_ctx = NULL; - av1_free_ref_frame_buffers(cm->buffer_pool); av1_free_txb_buf(cpi); av1_free_context_buffers(cm); @@ -243,10 +245,15 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { #if !CONFIG_REALTIME_ONLY av1_free_restoration_buffers(cm); #endif + + if (!is_stat_generation_stage(cpi)) + av1_free_cdef_buffers(cm, &cpi->mt_info.cdef_worker, + &cpi->mt_info.cdef_sync, + 
cpi->mt_info.num_mod_workers[MOD_CDEF]); + aom_free_frame_buffer(&cpi->trial_frame_rst); aom_free_frame_buffer(&cpi->scaled_source); aom_free_frame_buffer(&cpi->scaled_last_source); - aom_free_frame_buffer(&cpi->alt_ref_buffer); free_token_info(token_info); @@ -259,6 +266,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { for (int j = 0; j < 2; ++j) { aom_free(cpi->td.mb.tmp_pred_bufs[j]); } + aom_free(cpi->td.mb.pixel_gradient_info); #if CONFIG_DENOISE if (cpi->denoise_and_model) { @@ -271,11 +279,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { cpi->film_grain_table = NULL; } - for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { - aom_free(cpi->level_params.level_info[i]); - } - - if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi); + if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi); if (cpi->consec_zero_mv) { aom_free(cpi->consec_zero_mv); @@ -285,7 +289,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - const int num_64x64_blocks = (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; if (cpi->td.vt64x64) { if (num_64x64_blocks != cpi->td.num_64x64_blocks) { aom_free(cpi->td.vt64x64); @@ -301,7 +305,7 @@ static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { static AOM_INLINE void alloc_altref_frame_buffer(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const AV1EncoderConfig *oxcf = &cpi->oxcf; // When lag_in_frames <= 1, alt-ref frames are not enabled. In this case, @@ -311,29 +315,29 @@ static AOM_INLINE void alloc_altref_frame_buffer(AV1_COMP *cpi) { // TODO(agrange) Check if ARF is enabled and skip allocation if not. 
if (aom_realloc_frame_buffer( - &cpi->alt_ref_buffer, oxcf->frm_dim_cfg.width, + &cpi->ppi->alt_ref_buffer, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int byte_alignment = cm->features.byte_alignment; if (aom_realloc_frame_buffer( &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, 0)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); // The frame buffer trial_frame_rst is used during loop restoration filter // search. Hence it is allocated only when loop restoration is used. 
- const int use_restoration = cm->seq_params.enable_restoration && + const int use_restoration = cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; if (use_restoration) { @@ -342,7 +346,7 @@ static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL, 0)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); } @@ -351,7 +355,7 @@ static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); // The frame buffer cpi->scaled_last_source is used to hold the previous @@ -367,7 +371,7 @@ static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); } } @@ -384,16 +388,16 @@ static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source( if (aom_realloc_frame_buffer( &cpi->scaled_source, scaled_width, scaled_height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - 
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); assert(cpi->scaled_source.y_crop_width == scaled_width); assert(cpi->scaled_source.y_crop_height == scaled_height); av1_resize_and_extend_frame_nonnormative( - cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params.bit_depth, + cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params->bit_depth, num_planes); return &cpi->scaled_source; } diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c index 7a7e8505b4..557268f9d3 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c +++ b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c @@ -344,7 +344,7 @@ static void configure_static_seg_features(AV1_COMP *cpi) { seg->update_data = 1; qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); @@ -459,13 +459,13 @@ void av1_apply_active_map(AV1_COMP *cpi) { #if !CONFIG_REALTIME_ONLY static void process_tpl_stats_frame(AV1_COMP *cpi) { - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; AV1_COMMON *const cm = &cpi->common; - assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size)); + assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size)); - const int tpl_idx = gf_group->index; - TplParams *const tpl_data = &cpi->tpl_data; + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; @@ -497,22 +497,23 @@ static void process_tpl_stats_frame(AV1_COMP *cpi) { } else { 
aom_clear_system_state(); cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; - if (is_frame_tpl_eligible(gf_group, gf_group->index)) { - if (cpi->lap_enabled) { - double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval); + if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { + if (cpi->ppi->lap_enabled) { + double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval); const int gfu_boost = get_gfu_boost_from_r0_lap( min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0, - cpi->rc.num_stats_required_for_gfu_boost); + cpi->ppi->p_rc.num_stats_required_for_gfu_boost); // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost, // gfu_boost); - cpi->rc.gfu_boost = combine_prior_with_tpl_boost( - min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost, - gfu_boost, cpi->rc.num_stats_used_for_gfu_boost); + cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( + min_boost_factor, MAX_BOOST_COMBINE_FACTOR, + cpi->ppi->p_rc.gfu_boost, gfu_boost, + cpi->ppi->p_rc.num_stats_used_for_gfu_boost); } else { const int gfu_boost = (int)(200.0 / cpi->rd.r0); - cpi->rc.gfu_boost = combine_prior_with_tpl_boost( + cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, - cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); + cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); } } aom_clear_system_state(); @@ -529,17 +530,17 @@ void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, av1_set_speed_features_framesize_dependent(cpi, cpi->speed); #if !CONFIG_REALTIME_ONLY - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; if (cpi->oxcf.algo_cfg.enable_tpl_model && - is_frame_tpl_eligible(gf_group, gf_group->index)) { + is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { process_tpl_stats_frame(cpi); av1_tpl_rdmult_setup(cpi); } #endif // Decide q and q bounds. 
- *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, - cpi->gf_group.index, bottom_index, top_index); + *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index, + bottom_index, top_index); // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. @@ -564,6 +565,23 @@ static void reset_film_grain_chroma_params(aom_film_grain_t *pars) { memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb)); } +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf) { + SequenceHeader *const seq_params = &ppi->seq_params; + const TuneCfg *const tune_cfg = &oxcf->tune_cfg; + + if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename || + tune_cfg->content == AOM_CONTENT_FILM) { + seq_params->film_grain_params_present = 1; + } else { +#if CONFIG_DENOISE + seq_params->film_grain_params_present = (oxcf->noise_level > 0); +#else + seq_params->film_grain_params_present = 0; +#endif + } +} + void av1_update_film_grain_parameters(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; @@ -577,39 +595,30 @@ void av1_update_film_grain_parameters(struct AV1_COMP *cpi, } if (tune_cfg->film_grain_test_vector) { - cm->seq_params.film_grain_params_present = 1; if (cm->current_frame.frame_type == KEY_FRAME) { memcpy(&cm->film_grain_params, film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1, sizeof(cm->film_grain_params)); if (oxcf->tool_cfg.enable_monochrome) reset_film_grain_chroma_params(&cm->film_grain_params); - cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; - if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) { + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) { cm->film_grain_params.clip_to_restricted_range = 0; } } } else if (tune_cfg->film_grain_table_filename) { - cm->seq_params.film_grain_params_present = 1; - 
cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t)); aom_film_grain_table_read(cpi->film_grain_table, - tune_cfg->film_grain_table_filename, &cm->error); + tune_cfg->film_grain_table_filename, cm->error); } else if (tune_cfg->content == AOM_CONTENT_FILM) { - cm->seq_params.film_grain_params_present = 1; - cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; if (oxcf->tool_cfg.enable_monochrome) reset_film_grain_chroma_params(&cm->film_grain_params); - if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) cm->film_grain_params.clip_to_restricted_range = 0; } else { -#if CONFIG_DENOISE - cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0); -#else - cm->seq_params.film_grain_params_present = 0; -#endif memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } } @@ -643,7 +652,7 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, if (aom_yv12_realloc_with_new_border( &ref_fb->buf, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, num_planes) != 0) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } } @@ -652,7 +661,7 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, if (new_fb == NULL) { const int new_fb_idx = get_free_fb(cm); if (new_fb_idx == INVALID_IDX) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); } force_scaling = 1; @@ -663,30 +672,30 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, new_fb->buf.y_crop_height != cm->height) { if (aom_realloc_frame_buffer( &new_fb->buf, cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - 
cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, NULL, 0)) { if (force_scaling) { // Release the reference acquired in the get_free_fb() call above. --new_fb->ref_count; } - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } #if CONFIG_AV1_HIGHBITDEPTH - if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8) + if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, num_planes); else av1_resize_and_extend_frame_nonnormative( - ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes); + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes); #else if (use_optimized_scaler) av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, num_planes); else av1_resize_and_extend_frame_nonnormative( - ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes); + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes); #endif cpi->scaled_ref_buf[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); @@ -704,10 +713,8 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, } } -BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) { - const AV1_COMMON *const cm = &cpi->common; - const AV1EncoderConfig *const oxcf = &cpi->oxcf; - +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers) { if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) return BLOCK_64X64; if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) @@ -715,7 +722,7 @@ BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) { assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); - if (cpi->svc.number_spatial_layers > 1 || + if 
(number_spatial_layers > 1 || oxcf->resize_cfg.resize_mode != RESIZE_NONE) { // Use the configured size (top resolution) for spatial layers or // on resize. @@ -732,7 +739,7 @@ BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) { // speed-feature. if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE && oxcf->resize_cfg.resize_mode == RESIZE_NONE && oxcf->speed >= 1) { - return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64; + return AOMMIN(width, height) > 480 ? BLOCK_128X128 : BLOCK_64X64; } return BLOCK_128X128; @@ -753,8 +760,10 @@ void av1_setup_frame(AV1_COMP *cpi) { if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) { - if (!cpi->seq_params_locked) { - set_sb_size(&cm->seq_params, av1_select_sb_size(cpi)); + if (!cpi->ppi->seq_params_locked) { + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->svc.number_spatial_layers)); } } else { const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); @@ -959,7 +968,7 @@ void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) { av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run, 0); @@ -1005,13 +1014,13 @@ void av1_finalize_encoded_frame(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; - if (!cm->seq_params.reduced_still_picture_hdr && + if (!cm->seq_params->reduced_still_picture_hdr && encode_show_existing_frame(cm)) { RefCntBuffer *const frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; if (frame_to_show == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, 
AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a reconstructed frame"); } assert(frame_to_show->ref_count > 0); @@ -1019,7 +1028,7 @@ void av1_finalize_encoded_frame(AV1_COMP *const cpi) { } if (!encode_show_existing_frame(cm) && - cm->seq_params.film_grain_params_present && + cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { // Copy the current frame's film grain params to the its corresponding // RefCntBuffer slot. @@ -1232,7 +1241,7 @@ static void save_extra_coding_context(AV1_COMP *cpi) { cc->lf = cm->lf; cc->cdef_info = cm->cdef_info; cc->rc = cpi->rc; - cc->mv_stats = cpi->mv_stats; + cc->mv_stats = cpi->ppi->mv_stats; } void av1_save_all_coding_context(AV1_COMP *cpi) { @@ -1301,11 +1310,11 @@ void av1_dump_filtered_recon_frames(AV1_COMP *cpi) { "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " "refresh_alt_ref_frame=%d, " "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", - current_frame->frame_number, cpi->gf_group.index, - cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint, - cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active, - cpi->refresh_frame.alt_ref_frame, recon_buf->y_stride, - recon_buf->uv_stride, cm->width, cm->height); + current_frame->frame_number, cpi->gf_frame_index, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], + current_frame->order_hint, cm->show_frame, cm->show_existing_frame, + cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame, + recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); #if 0 int ref_frame; printf("get_ref_frame_map_idx: ["); diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h index 40652e956c..e75bc79ba6 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h @@ -125,14 +125,14 @@ static AOM_INLINE void 
init_buffer_indices( } #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].jsdaf = JSDAF; \ - cpi->fn_ptr[BT].jsvaf = JSVAF; + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; #define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_BFP( \ @@ -325,8 +325,8 @@ MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg) #endif // CONFIG_AV1_HIGHBITDEPTH #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ - cpi->fn_ptr[BT].msdf = MCSDF; \ - cpi->fn_ptr[BT].msvf = MCSVF; + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; #define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \ @@ -386,8 +386,8 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) #endif #define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \ - cpi->fn_ptr[BT].sdsf = SDSF; \ - cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; #define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \ @@ -487,9 +487,9 @@ MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d) aom_highbd_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ - cpi->fn_ptr[BT].osdf = OSDF; \ - cpi->fn_ptr[BT].ovf = OVF; \ - cpi->fn_ptr[BT].osvf = OSVF; + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; #define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ @@ -542,10 +542,10 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) #endif 
-static AOM_INLINE void highbd_set_var_fns(AV1_COMP *const cpi) { - AV1_COMMON *const cm = &cpi->common; - if (cm->seq_params.use_highbitdepth) { - switch (cm->seq_params.bit_depth) { +static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) { + SequenceHeader *const seq_params = &ppi->seq_params; + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { case AOM_BITS_8: #if !CONFIG_REALTIME_ONLY HIGHBD_BFP_WRAPPER(64, 16, 8) @@ -850,7 +850,7 @@ static AOM_INLINE void highbd_set_var_fns(AV1_COMP *const cpi) { default: assert(0 && - "cm->seq_params.bit_depth should be AOM_BITS_8, " + "cm->seq_params->bit_depth should be AOM_BITS_8, " "AOM_BITS_10 or AOM_BITS_12"); } } @@ -873,6 +873,33 @@ static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) { av1_copy(frame_probs->switchable_interp_probs, default_switchable_interp_probs); } + +#if CONFIG_FRAME_PARALLEL_ENCODE + FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs); + } + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(temp_frame_probs->warped_probs, default_warped_probs); + } + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(temp_frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } +#endif +} + +static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst, + const CdefInfo *const src) { + dst->cdef_bits = src->cdef_bits; + dst->cdef_damping = src->cdef_damping; + av1_copy(dst->cdef_strengths, src->cdef_strengths); + av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths); + dst->nb_cdef_strengths = src->nb_cdef_strengths; } // Coding context that only needs to be restored when recode loop includes 
@@ -882,9 +909,9 @@ static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; cm->lf = cc->lf; - cm->cdef_info = cc->cdef_info; + restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info); cpi->rc = cc->rc; - cpi->mv_stats = cc->mv_stats; + cpi->ppi->mv_stats = cc->mv_stats; } static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, @@ -964,6 +991,8 @@ static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) { } } +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf); void av1_update_film_grain_parameters(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf); @@ -972,7 +1001,8 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, void av1_setup_frame(AV1_COMP *cpi); -BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi); +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers); void av1_apply_active_map(AV1_COMP *cpi); diff --git a/third_party/libaom/source/libaom/av1/encoder/encodetxb.c b/third_party/libaom/source/libaom/av1/encoder/encodetxb.c index 7b0b281c80..0eb134890e 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodetxb.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodetxb.c @@ -26,11 +26,11 @@ void av1_alloc_txb_buf(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; - int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * - ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1); const int num_planes = av1_num_planes(cm); - const int subsampling_x = cm->seq_params.subsampling_x; - const int subsampling_y = cm->seq_params.subsampling_y; + const int subsampling_x = 
cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; const int chroma_max_sb_square = MAX_SB_SQUARE >> (subsampling_x + subsampling_y); const int num_tcoeffs = @@ -624,6 +624,7 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, const int coeff_ctx = coeff_contexts[pos]; const tran_low_t v = qcoeff[pos]; const tran_low_t level = abs(v); + td->abs_sum_level += level; if (allow_update_cdf) { if (c == eob - 1) { @@ -719,7 +720,7 @@ void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td, CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; - const int mib_size_log2 = cm->seq_params.mib_size_log2; + const int mib_size_log2 = cm->seq_params->mib_size_log2; const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; const int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); diff --git a/third_party/libaom/source/libaom/av1/encoder/ethread.c b/third_party/libaom/source/libaom/av1/encoder/ethread.c index 3735ca3c8b..d274b6b84f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ethread.c +++ b/third_party/libaom/source/libaom/av1/encoder/ethread.c @@ -11,9 +11,11 @@ #include "av1/common/warped_motion.h" +#include "av1/encoder/bitstream.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/ethread.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/firstpass.h" @@ -52,7 +54,7 @@ static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->td.mb.e_mbd; - const int mib_size = cm->seq_params.mib_size; + const int mib_size = cm->seq_params->mib_size; const int frame_lf_count = av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int row = 0; row < cm->tiles.rows; row++) { @@ -68,7 +70,8 @@ static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; MB_MODE_INFO *mbmi = mi[0]; - if (mbmi->skip_txfm == 1 && (mbmi->bsize == cm->seq_params.sb_size)) { + if (mbmi->skip_txfm == 1 && + (mbmi->bsize == cm->seq_params->sb_size)) { for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; mbmi->delta_lf_from_base = xd->delta_lf_from_base; @@ -362,7 +365,7 @@ static AOM_INLINE void switch_tile_and_get_next_job( *cur_tile_id = tile_id; const int unit_height = mi_size_high[fp_block_size]; get_next_job(&tile_data[tile_id], current_mi_row, - is_firstpass ? unit_height : cm->seq_params.mib_size); + is_firstpass ? unit_height : cm->seq_params->mib_size); } } @@ -441,13 +444,20 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { const BLOCK_SIZE fp_block_size = cpi->fp_block_size; int end_of_frame = 0; + + // When master thread does not have a valid job to process, xd->tile_ctx + // is not set and it contains NULL pointer. This can result in NULL pointer + // access violation if accessed beyond the encode stage. Hence, updating + // thread_data->td->mb.e_mbd.tile_ctx is initialized with common frame + // context to avoid NULL pointer access in subsequent stages. + thread_data->td->mb.e_mbd.tile_ctx = cm->fc; while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif if (!get_next_job(&cpi->tile_data[cur_tile_id], ¤t_mi_row, - cm->seq_params.mib_size)) { + cm->seq_params->mib_size)) { // No jobs are available for the current tile. 
Query for the status of // other tiles and get the next job if available switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id, @@ -470,6 +480,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { td->mb.e_mbd.tile_ctx = td->tctx; td->mb.tile_pb_ctx = &this_tile->tctx; + td->abs_sum_level = 0; if (this_tile->allow_update_cdf) { td->mb.row_ctx = this_tile->row_ctx; @@ -482,7 +493,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, &td->mb.e_mbd); - cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); if (td->mb.txfm_search_info.txb_rd_records != NULL) { av1_crc32c_calculator_init( &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator); @@ -492,6 +503,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif + this_tile->abs_sum_level += td->abs_sum_level; row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex_); @@ -526,16 +538,12 @@ static int enc_worker_hook(void *arg1, void *unused) { return 1; } -void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) { AV1_COMMON *const cm = &cpi->common; - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); MultiThreadInfo *const mt_info = &cpi->mt_info; - assert(mt_info->workers != NULL); - assert(mt_info->tile_thr_data != NULL); - -#if CONFIG_MULTITHREAD - if (cpi->oxcf.row_mt == 1) { + if (is_first_pass || cpi->oxcf.row_mt == 1) { AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; if (enc_row_mt->mutex_ == NULL) { CHECK_MEM_ERROR(cm, enc_row_mt->mutex_, @@ -543,24 +551,39 @@ void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL); } } - 
AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; - if (gm_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, gm_sync->mutex_, - aom_malloc(sizeof(*(gm_sync->mutex_)))); - if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); - } - AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; - if (tf_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_))); - if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); - } - AV1CdefSync *cdef_sync = &mt_info->cdef_sync; - if (cdef_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, cdef_sync->mutex_, - aom_malloc(sizeof(*(cdef_sync->mutex_)))); - if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + + if (!is_first_pass) { + AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; + if (gm_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, gm_sync->mutex_, + aom_malloc(sizeof(*(gm_sync->mutex_)))); + if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); + } +#if !CONFIG_REALTIME_ONLY + AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; + if (tf_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, tf_sync->mutex_, + aom_malloc(sizeof(*tf_sync->mutex_))); + if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); + } +#endif // !CONFIG_REALTIME_ONLY + AV1CdefSync *cdef_sync = &mt_info->cdef_sync; + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } } -#endif +} +#endif // CONFIG_MULTITHREAD + +void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { + AV1_COMMON *const cm = &cpi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + MultiThreadInfo *const mt_info = &cpi->mt_info; + + assert(mt_info->workers != NULL); + assert(mt_info->tile_thr_data != NULL); for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; @@ -576,7 +599,7 @@ void 
av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { // Create threads if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Tile encoder thread creation failed"); } else { // Main thread acts as a worker and uses the thread data in cpi. @@ -625,10 +648,6 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer); - CHECK_MEM_ERROR( - cm, thread_data->td->tmp_conv_dst, - aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * - sizeof(*thread_data->td->tmp_conv_dst))); for (int j = 0; j < 2; ++j) { CHECK_MEM_ERROR( cm, thread_data->td->tmp_pred_bufs[j], @@ -636,9 +655,14 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { sizeof(*thread_data->td->tmp_pred_bufs[j]))); } + const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; + CHECK_MEM_ERROR(cm, thread_data->td->pixel_gradient_info, + aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) * + plane_types * MAX_SB_SQUARE)); + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { const int num_64x64_blocks = - (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; CHECK_MEM_ERROR( cm, thread_data->td->vt64x64, aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks)); @@ -680,6 +704,10 @@ void av1_create_workers(AV1_COMP *cpi, int num_workers) { // Set up shared coeff buffers. 
av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf); + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->td->tmp_conv_dst))); } ++mt_info->num_workers; } @@ -724,7 +752,7 @@ static AOM_INLINE void fp_create_enc_workers(AV1_COMP *cpi, int num_workers) { if (create_workers) { // Create threads if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Tile encoder thread creation failed"); } } else { @@ -764,7 +792,7 @@ static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info, } if (had_error) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to encode tile data"); } @@ -780,14 +808,15 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, !frame_is_intra_only(&cpi->common)) av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, &thread_data->td->mb); - if (thread_data->td->mb.txfm_search_info.txb_rd_records) { - aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records); - thread_data->td->mb.txfm_search_info.txb_rd_records = NULL; - } - if (thread_data->td != &cpi->td && - cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { - aom_free(thread_data->td->mb.mv_costs); + if (thread_data->td != &cpi->td) { + if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.mv_costs); + } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.dv_costs); + } } + av1_dealloc_mb_data(&cpi->common, &thread_data->td->mb); // Accumulate counters. if (i > 0) { @@ -822,6 +851,7 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, thread_data->td->intrabc_used = 0; thread_data->td->deltaq_used = 0; + thread_data->td->abs_sum_level = 0; // Before encoding a frame, copy the thread data from cpi. 
if (thread_data->td != &cpi->td) { @@ -846,15 +876,19 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, sizeof(MvCosts)); } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts))); + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } } + av1_alloc_mb_data(cm, &thread_data->td->mb, + cpi->sf.rt_sf.use_nonrd_pick_mode); + // Reset cyclic refresh counters. av1_init_cyclic_refresh_counters(&thread_data->td->mb); - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records, - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords))); - } - if (thread_data->td->counts != &cpi->counts) { memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); } @@ -867,6 +901,8 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, thread_data->td->mb.tmp_pred_bufs[j] = thread_data->td->tmp_pred_bufs[j]; } + thread_data->td->mb.pixel_gradient_info = + thread_data->td->pixel_gradient_info; thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; for (int j = 0; j < 2; ++j) { @@ -904,11 +940,16 @@ static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, sizeof(MvCosts)); } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts))); + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } } - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records, - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords))); - } + + av1_alloc_mb_data(cm, &thread_data->td->mb, + cpi->sf.rt_sf.use_nonrd_pick_mode); } } 
#endif @@ -1191,13 +1232,15 @@ void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) { sync_enc_workers(&cpi->mt_info, cm, num_workers); for (int i = num_workers - 1; i >= 0; i--) { EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i]; - if (thread_data->td != &cpi->td && - cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { - aom_free(thread_data->td->mb.mv_costs); - } - if (thread_data->td->mb.txfm_search_info.txb_rd_records) { - aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records); + if (thread_data->td != &cpi->td) { + if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.mv_costs); + } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.dv_costs); + } } + av1_dealloc_mb_data(cm, &thread_data->td->mb); } } @@ -1277,11 +1320,15 @@ static int tpl_worker_hook(void *arg1, void *unused) { AV1_COMMON *cm = &cpi->common; MACROBLOCK *x = &thread_data->td->mb; MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats; CommonModeInfoParams *mi_params = &cm->mi_params; - BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d); + BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); TX_SIZE tx_size = max_txsize_lookup[bsize]; int mi_height = mi_size_high[bsize]; - int num_active_workers = cpi->tpl_data.tpl_mt_sync.num_threads_working; + int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); + for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows; mi_row += num_active_workers * mi_height) { // Motion estimation row boundary @@ -1290,7 +1337,7 @@ static int tpl_worker_hook(void *arg1, void *unused) { xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); - av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size); + av1_mc_flow_dispenser_row(cpi, 
tpl_txfm_stats, x, mi_row, bsize, tx_size); } return 1; } @@ -1370,6 +1417,24 @@ static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, // OBMC buffers are used only to init MS params and remain unused when // called from tpl, hence set the buffers to defaults. av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + } + } +} + +// Accumulate transform stats after tpl. +static void tpl_accumulate_txfm_stats(ThreadData *main_td, + const MultiThreadInfo *mt_info, + int num_workers) { + TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + ThreadData *td = thread_data->td; + if (td != main_td) { + const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; + av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats); } } } @@ -1379,7 +1444,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CommonModeInfoParams *mi_params = &cm->mi_params; MultiThreadInfo *mt_info = &cpi->mt_info; - TplParams *tpl_data = &cpi->tpl_data; + TplParams *tpl_data = &cpi->ppi->tpl_data; AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync; int mb_rows = mi_params->mb_rows; int num_workers = @@ -1398,6 +1463,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { prepare_tpl_workers(cpi, tpl_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, cm, num_workers); + tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers); } // Deallocate memory for temporal filter multi-thread synchronization. 
@@ -1752,6 +1818,331 @@ void av1_global_motion_estimation_mt(AV1_COMP *cpi) { } #endif // !CONFIG_REALTIME_ONLY +// Compare and order tiles based on absolute sum of tx coeffs. +static int compare_tile_order(const void *a, const void *b) { + const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a; + const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b; + + if (tile_a->abs_sum_level > tile_b->abs_sum_level) + return -1; + else if (tile_a->abs_sum_level == tile_b->abs_sum_level) + return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1); + else + return 1; +} + +// Get next tile index to be processed for pack bitstream +static AOM_INLINE int get_next_pack_bs_tile_idx( + AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) { + assert(pack_bs_sync->next_job_idx <= num_tiles); + if (pack_bs_sync->next_job_idx == num_tiles) return -1; + + return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++] + .tile_idx; +} + +// Calculates bitstream chunk size based on total buffer size and tile or tile +// group size. +static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size, + const int frame_or_tg_size, + size_t *remain_buf_size, + size_t max_buf_size, + int is_last_chunk) { + size_t this_chunk_size; + assert(*remain_buf_size > 0); + if (is_last_chunk) { + this_chunk_size = *remain_buf_size; + *remain_buf_size = 0; + } else { + const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size; + this_chunk_size = (size_t)(size_scale / frame_or_tg_size); + *remain_buf_size -= this_chunk_size; + assert(*remain_buf_size > 0); + } + assert(this_chunk_size > 0); + return this_chunk_size; +} + +// Initializes params required for pack bitstream tile. 
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + PackBSParams *const pack_bs_params_arr, + uint8_t obu_extn_header) { + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + // Tile group size in terms of number of tiles. + const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs; + uint8_t *tile_dst = dst; + uint8_t *tile_data_curr = dst; + // Max tile group count can not be more than MAX_TILES. + int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units + int tile_idx; + int tg_idx = 0; + int tile_count_in_tg = 0; + int new_tg = 1; + + // Populate pack bitstream params of all tiles. + for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info; + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + // Calculate tile size in mi units. + const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) * + (tile_info->mi_row_end - tile_info->mi_row_start); + int is_last_tile_in_tg = 0; + tile_count_in_tg++; + if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1)) + is_last_tile_in_tg = 1; + + // Populate pack bitstream params of this tile. 
+ pack_bs_params->curr_tg_hdr_size = 0; + pack_bs_params->obu_extn_header = obu_extn_header; + pack_bs_params->saved_wb = saved_wb; + pack_bs_params->obu_header_size = 0; + pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params->new_tg = new_tg; + pack_bs_params->tile_col = tile_info->tile_col; + pack_bs_params->tile_row = tile_info->tile_row; + pack_bs_params->tile_size_mi = tile_size_mi; + tg_size_mi[tg_idx] += tile_size_mi; + + if (new_tg) new_tg = 0; + if (is_last_tile_in_tg) { + tile_count_in_tg = 0; + new_tg = 1; + tg_idx++; + } + } + + assert(cpi->available_bs_size > 0); + size_t tg_buf_size[MAX_TILES] = { 0 }; + size_t max_buf_size = cpi->available_bs_size; + size_t remain_buf_size = max_buf_size; + const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols; + + tile_idx = 0; + // Prepare obu, tile group and frame header of each tile group. + for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + int is_last_tg = tg_idx == cpi->num_tg - 1; + // Prorate bitstream buffer size based on tile group size and available + // buffer size. This buffer will be used to store headers and tile data. + tg_buf_size[tg_idx] = + get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size, + max_buf_size, is_last_tg); + + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_dst; + + // Write obu, tile group and frame header at first tile in the tile + // group. + av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx); + tile_dst += tg_buf_size[tg_idx]; + + // Exclude headers from tile group buffer size. + tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size; + tile_idx += tg_size_in_tiles; + } + + tg_idx = 0; + // Calculate bitstream buffer size of each tile in the tile group. 
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + + if (pack_bs_params->new_tg) { + max_buf_size = tg_buf_size[tg_idx]; + remain_buf_size = max_buf_size; + } + + // Prorate bitstream buffer size of this tile based on tile size and + // available buffer size. For this proration, header size is not accounted. + const size_t tile_buf_size = get_bs_chunk_size( + pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size, + max_buf_size, pack_bs_params->is_last_tile_in_tg); + pack_bs_params->tile_buf_size = tile_buf_size; + + // Update base address of bitstream buffer for tile and tile group. + if (pack_bs_params->new_tg) { + tile_dst = pack_bs_params->dst; + tile_data_curr = pack_bs_params->tile_data_curr; + // Account header size in first tile of a tile group. + pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size; + } else { + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_data_curr; + } + + if (pack_bs_params->is_last_tile_in_tg) tg_idx++; + tile_dst += pack_bs_params->tile_buf_size; + } +} + +// Worker hook function of pack bitsteam multithreading. 
+static int pack_bs_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + PackBSParams *const pack_bs_params = (PackBSParams *)arg2; + AV1_COMP *const cpi = thread_data->cpi; + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + + while (1) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pack_bs_sync->mutex_); +#endif + const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles); +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pack_bs_sync->mutex_); +#endif + if (tile_idx == -1) break; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + + av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]); + } + + return 1; +} + +// Prepares thread data and workers of pack bitsteam multithreading. +static void prepare_pack_bs_workers(AV1_COMP *const cpi, + PackBSParams *const pack_bs_params, + AVxWorkerHook hook, const int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + if (i == 0) thread_data->td = &cpi->td; + + if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb; + + thread_data->cpi = cpi; + thread_data->start = i; + thread_data->thread_id = i; + av1_reset_pack_bs_thread_data(thread_data->td); + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = pack_bs_params; + } + + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync; + const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols; +#if CONFIG_MULTITHREAD + if (pack_bs_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_, + aom_malloc(sizeof(*pack_bs_sync->mutex_))); + if 
(pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); + } +#endif + pack_bs_sync->next_job_idx = 0; + + PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order; + // Reset tile order data of pack bitstream + av1_zero_array(pack_bs_tile_order, num_tiles); + + // Populate pack bitstream tile order structure + for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + pack_bs_tile_order[tile_idx].abs_sum_level = + cpi->tile_data[tile_idx].abs_sum_level; + pack_bs_tile_order[tile_idx].tile_idx = tile_idx; + } + + // Sort tiles in descending order based on tile area. + qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order), + compare_tile_order); +} + +// Accumulates data after pack bitsteam processing. +static void accumulate_pack_bs_data( + AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr, + uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info, + int *const largest_tile_id, unsigned int *max_tile_size, + uint32_t *const obu_header_size, uint8_t **tile_data_start, + const int num_workers) { + const AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int tile_count = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + size_t curr_tg_data_size = 0; + int is_first_tg = 1; + uint8_t *curr_tg_start = dst; + size_t src_offset = 0; + size_t dst_offset = 0; + + for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) { + // PackBSParams stores all parameters required to pack tile and header + // info. 
+ const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + uint32_t tile_size = 0; + + if (pack_bs_params->new_tg) { + curr_tg_start = dst + *total_size; + curr_tg_data_size = pack_bs_params->curr_tg_hdr_size; + *tile_data_start += pack_bs_params->curr_tg_hdr_size; + *obu_header_size = pack_bs_params->obu_header_size; + } + curr_tg_data_size += + pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4); + + if (pack_bs_params->buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params->buf.size; + } + tile_size += + (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size; + + // Pack all the chunks of tile bitstreams together + if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size); + + if (pack_bs_params->is_last_tile_in_tg) + av1_write_last_tile_info( + cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size, + curr_tg_start, &tile_size, tile_data_start, largest_tile_id, + &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header); + src_offset += pack_bs_params->tile_buf_size; + dst_offset += tile_size; + *total_size += tile_size; + } + + // Accumulate thread data + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int idx = num_workers - 1; idx >= 0; idx--) { + ThreadData const *td = mt_info->tile_thr_data[idx].td; + av1_accumulate_pack_bs_thread_data(cpi, td); + } +} + +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = mt_info->num_mod_workers[MOD_PACK_BS]; + + PackBSParams pack_bs_params[MAX_TILES]; + uint32_t tile_size[MAX_TILES] = { 0 }; + + for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++) + 
pack_bs_params[tile_idx].total_size = &tile_size[tile_idx]; + + init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header); + prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook, + num_workers); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, &cpi->common, num_workers); + accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info, + largest_tile_id, max_tile_size, obu_header_size, + tile_data_start, num_workers); +} + // Deallocate memory for CDEF search multi-thread synchronization. void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) { (void)cdef_sync; @@ -1780,6 +2171,9 @@ static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) { // Initializes cdef_sync parameters. static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) { +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); +#endif // CONFIG_MULTITHREAD cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; @@ -1896,6 +2290,12 @@ static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) { return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } +// Computes num_workers for pack bitstream multi-threading. 
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) { + if (cpi->oxcf.max_threads <= 1) return 1; + return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads); +} + int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) { int num_mod_workers = 0; switch (mod_name) { @@ -1915,7 +2315,9 @@ int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) { case MOD_CDEF_SEARCH: num_mod_workers = compute_num_cdef_workers(cpi); break; + case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break; case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break; + case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break; default: assert(0); break; } return (num_mod_workers); diff --git a/third_party/libaom/source/libaom/av1/encoder/ethread.h b/third_party/libaom/source/libaom/av1/encoder/ethread.h index 55e7f7be39..c2ab864690 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ethread.h +++ b/third_party/libaom/source/libaom/av1/encoder/ethread.h @@ -80,6 +80,10 @@ int av1_get_max_num_workers(AV1_COMP *cpi); void av1_create_workers(AV1_COMP *cpi, int num_workers); +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass); +#endif // CONFIG_MULTITHREAD + void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers); void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info, @@ -87,6 +91,13 @@ void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info, void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync); +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start); + #ifdef __cplusplus } // extern "C" #endif diff --git 
a/third_party/libaom/source/libaom/av1/encoder/external_partition.c b/third_party/libaom/source/libaom/av1/encoder/external_partition.c new file mode 100644 index 0000000000..542b2bb878 --- /dev/null +++ b/third_party/libaom/source/libaom/av1/encoder/external_partition.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/encoder/external_partition.h" + +aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, + aom_ext_part_config_t config, + ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + ext_part_controller->funcs = funcs; + ext_part_controller->config = config; + const aom_ext_part_status_t status = ext_part_controller->funcs.create_model( + ext_part_controller->funcs.priv, &ext_part_controller->config, + &ext_part_controller->model); + if (status == AOM_EXT_PART_ERROR) { + return AOM_CODEC_ERROR; + } else if (status == AOM_EXT_PART_TEST) { + ext_part_controller->test_mode = 1; + ext_part_controller->ready = 0; + return AOM_CODEC_OK; + } + assert(status == AOM_EXT_PART_OK); + ext_part_controller->ready = 1; + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + av1_zero(ext_part_controller); + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) { + if 
(ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + if (ext_part_controller->ready) { + const aom_ext_part_status_t status = + ext_part_controller->funcs.delete_model(ext_part_controller->model); + if (status != AOM_EXT_PART_OK) { + return AOM_CODEC_ERROR; + } + } + return av1_ext_part_init(ext_part_controller); +} + +bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, + aom_partition_decision_t *decision) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(decision != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.get_partition_decision( + ext_part_controller->model, decision); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(stats != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.send_partition_stats( + ext_part_controller->model, stats); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_features(ExtPartController *ext_part_controller, + const aom_partition_features_t *features) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(features != NULL); + const aom_ext_part_status_t status = ext_part_controller->funcs.send_features( + ext_part_controller->model, features); + if (status != AOM_EXT_PART_OK) return false; + return true; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/external_partition.h b/third_party/libaom/source/libaom/av1/encoder/external_partition.h new file mode 100644 index 0000000000..20f03ed752 --- /dev/null +++ b/third_party/libaom/source/libaom/av1/encoder/external_partition.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ +#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ + +#include <stdbool.h> + +#include "aom/aom_codec.h" +#include "aom/aom_external_partition.h" + +#ifdef __cplusplus +extern "C" { +#endif +/*!\cond */ + +typedef struct ExtPartController { + int ready; + int test_mode; + aom_ext_part_config_t config; + aom_ext_part_model_t model; + aom_ext_part_funcs_t funcs; +} ExtPartController; + +aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, + aom_ext_part_config_t config, + ExtPartController *ext_part_controller); + +aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller); + +aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller); + +bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, + aom_partition_decision_t *decision); + +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats); + +bool av1_ext_part_send_features(ExtPartController *ext_part_controller, + const aom_partition_features_t *features); + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/firstpass.c b/third_party/libaom/source/libaom/av1/encoder/firstpass.c index ff6814d04c..662b42c822 100644 --- a/third_party/libaom/source/libaom/av1/encoder/firstpass.c +++ b/third_party/libaom/source/libaom/av1/encoder/firstpass.c @@ -27,6 +27,7 
@@ #include "av1/common/entropymv.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" // av1_setup_dst_planes() +#include "av1/common/reconintra.h" #include "av1/common/txb_common.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" @@ -54,6 +55,8 @@ #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 +#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1 + static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats, struct aom_codec_pkt_list *pktlist) { struct aom_codec_cx_pkt pkt; @@ -108,6 +111,9 @@ void av1_twopass_zero_stats(FIRSTPASS_STATS *section) { section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; + section->is_flash = 0; + section->noise_var = 0; + section->cor_coeff = 1.0; } void av1_accumulate_stats(FIRSTPASS_STATS *section, @@ -118,9 +124,11 @@ void av1_accumulate_stats(FIRSTPASS_STATS *section, section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; + section->tr_coded_error += frame->tr_coded_error; section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_third_ref += frame->pcnt_third_ref; section->pcnt_neutral += frame->pcnt_neutral; section->intra_skip_pct += frame->intra_skip_pct; section->inactive_zone_rows += frame->inactive_zone_rows; @@ -177,8 +185,9 @@ static int get_num_mbs(const BLOCK_SIZE fp_block_size, } void av1_end_first_pass(AV1_COMP *cpi) { - if (cpi->twopass.stats_buf_ctx->total_stats) - output_stats(cpi->twopass.stats_buf_ctx->total_stats, cpi->output_pkt_list); + if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled) + output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, + cpi->ppi->output_pkt_list); } static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { @@ -261,15 +270,12 @@ static AOM_INLINE void 
first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->bsize; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; const int sr = get_search_range(&cpi->initial_dimensions); - const int step_param = 3 + sr; + const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr; const search_site_config *first_pass_search_sites = cpi->mv_search_params.search_site_cfg[SS_CFG_FPF]; const int fine_search_interval = cpi->is_screen_content_type && cpi->common.features.allow_intrabc; - if (fine_search_interval) { - av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); - } FULLPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv, first_pass_search_sites, @@ -281,7 +287,7 @@ static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, &this_best_mv, NULL); if (tmp_err < INT_MAX) { - aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize]; const MSBuffers *ms_buffers = &ms_params.ms_buffers; tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv, &v_fn_ptr, ms_buffers->src, ms_buffers->ref) + @@ -355,6 +361,86 @@ static double raw_motion_error_stdev(int *raw_motion_err_list, return raw_err_stdev; } +static AOM_INLINE int do_third_ref_motion_search(const RateControlCfg *rc_cfg, + const GFConfig *gf_cfg) { + return use_ml_model_to_decide_flat_gop(rc_cfg) && can_disable_altref(gf_cfg); +} + +static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) { + return (use_ml_model_to_decide_flat_gop(&oxcf->rc_cfg) && + can_disable_altref(&oxcf->gf_cfg)) || + (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL); +} +typedef struct intra_pred_block_pass1_args { + const SequenceHeader *seq_params; + MACROBLOCK *x; +} intra_pred_block_pass1_args; + +static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src, + int sstride, int width, int height, int use_hbd) { +#if 
CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride, + CONVERT_TO_SHORTPTR(dst), dstride, width, height); + } else { + aom_convolve_copy(src, sstride, dst, dstride, width, height); + } +#else + (void)use_hbd; + aom_convolve_copy(src, sstride, dst, dstride, width, height); +#endif +} + +static void first_pass_intra_pred_and_calc_diff(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + (void)block; + struct intra_pred_block_pass1_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const SequenceHeader *seq_params = args->seq_params; + const int src_stride = p->src.stride; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src, + src_stride, dst, dst_stride, blk_col, blk_row, plane); + + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); +} + +static void first_pass_predict_intra_block_for_luma_plane( + const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = AOM_PLANE_Y; + const MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int dst_stride = pd->dst.stride; + uint8_t *dst = pd->dst.buf; + const MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int src_stride = p->src.stride; + 
const uint8_t *src = p->src.buf; + + intra_pred_block_pass1_args args = { seq_params, x }; + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args); + + // copy source data to recon buffer, as the recon buffer will be used as a + // reference frame subsequently. + copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize], + block_size_high[bsize], seq_params->use_highbitdepth); +} + #define UL_INTRA_THRESH 50 #define INVALID_ROW -1 // Computes and returns the intra pred error of a block. @@ -388,11 +474,10 @@ static int firstpass_intra_prediction( const int qindex, FRAME_STATS *const stats) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int unit_scale = mi_size_wide[fp_block_size]; - const int use_dc_pred = (unit_col || unit_row) && (!unit_col || !unit_row); const int num_planes = av1_num_planes(cm); const BLOCK_SIZE bsize = get_bsize(mi_params, fp_block_size, unit_row, unit_col); @@ -412,9 +497,12 @@ static int firstpass_intra_prediction( xd->mi[0]->segment_id = 0; xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); xd->mi[0]->mode = DC_PRED; - xd->mi[0]->tx_size = use_dc_pred ? max_txsize_lookup[bsize] : TX_4X4; + xd->mi[0]->tx_size = TX_4X4; - av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); + if (cpi->sf.fp_sf.disable_recon) + first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize); + else + av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff); if (seq_params->use_highbitdepth) { switch (seq_params->bit_depth) { @@ -480,16 +568,22 @@ static int firstpass_intra_prediction( // Accumulate the intra error. 
stats->intra_error += (int64_t)this_intra_error; - const int hbd = is_cur_buf_hbd(xd); - const int stride = x->plane[0].src.stride; - const int num_8x8_rows = block_size_high[fp_block_size] / 8; - const int num_8x8_cols = block_size_wide[fp_block_size] / 8; - const uint8_t *buf = x->plane[0].src.buf; - for (int r8 = 0; r8 < num_8x8_rows; ++r8) { - for (int c8 = 0; c8 < num_8x8_cols; ++c8) { - stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input( - buf + c8 * 8 + r8 * 8 * stride, stride, hbd); - } + // Stats based on wavelet energy is used in the following cases : + // 1. ML model which predicts if a flat structure (golden-frame only structure + // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in + // constant quality mode under certain conditions. + // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL. + // Thus, wavelet energy calculation is enabled for the above cases. + if (calc_wavelet_energy(&cpi->oxcf)) { + const int hbd = is_cur_buf_hbd(xd); + const int stride = x->plane[0].src.stride; + const int num_8x8_rows = block_size_high[fp_block_size] / 8; + const int num_8x8_cols = block_size_wide[fp_block_size] / 8; + const uint8_t *buf = x->plane[0].src.buf; + stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input( + buf, stride, hbd, num_8x8_rows, num_8x8_cols); + } else { + stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; } return this_intra_error; @@ -516,13 +610,13 @@ static int get_prediction_error_bitdepth(const int is_high_bitdepth, static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, const int mb_row, const int mb_col, const int mb_rows, const int mb_cols, - MV *last_mv, FRAME_STATS *stats) { + MV *last_non_zero_mv, FRAME_STATS *stats) { if (is_zero_mv(&best_mv)) return; ++stats->mv_count; // Non-zero vector, was it different from the last non zero vector? 
- if (!is_equal_mv(&best_mv, last_mv)) ++stats->new_mv_count; - *last_mv = best_mv; + if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count; + *last_non_zero_mv = best_mv; // Does the row vector point inwards or outwards? if (mb_row < mb_rows / 2) { @@ -555,7 +649,6 @@ static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, } } -#define LOW_MOTION_ERROR_THRESH 25 // Computes and returns the inter prediction error from the last frame. // Computes inter prediction errors from the golden and alt ref frams and // Updates stats accordingly. @@ -576,8 +669,9 @@ static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, // this_intra_error: the intra prediction error of this block. // raw_motion_err_counts: the count of raw motion vectors. // raw_motion_err_list: the array that records the raw motion error. -// best_ref_mv: best reference mv found so far. -// last_mv: last mv. +// ref_mv: the reference used to start the motion search +// best_mv: the best mv found +// last_non_zero_mv: the last non zero mv found in this tile row. // stats: frame encoding stats. // Modifies: // raw_motion_err_list @@ -593,8 +687,8 @@ static int firstpass_inter_prediction( const int unit_col, const int recon_yoffset, const int recon_uvoffset, const int src_yoffset, const int alt_ref_frame_yoffset, const BLOCK_SIZE fp_block_size, const int this_intra_error, - const int raw_motion_err_counts, int *raw_motion_err_list, MV *best_ref_mv, - MV *last_mv, FRAME_STATS *stats) { + const int raw_motion_err_counts, int *raw_motion_err_list, const MV ref_mv, + MV *best_mv, MV *last_non_zero_mv, FRAME_STATS *stats) { int this_inter_error = this_intra_error; AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -612,7 +706,6 @@ static int firstpass_inter_prediction( const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols); // Assume 0,0 motion with no mv overhead. 
FULLPEL_MV mv = kZeroFullMv; - FULLPEL_MV tmp_mv = kZeroFullMv; xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. @@ -636,15 +729,15 @@ static int firstpass_inter_prediction( &unscaled_last_source_buf_2d); raw_motion_err_list[raw_motion_err_counts] = raw_motion_error; - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > LOW_MOTION_ERROR_THRESH || cpi->oxcf.speed <= 2) { + if (raw_motion_error > cpi->sf.fp_sf.skip_motion_search_threshold) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error); // If the current best reference mv is not centered on 0,0 then do a // 0,0 based search as well. - if (!is_zero_mv(best_ref_mv)) { + if (!is_zero_mv(&ref_mv)) { + FULLPEL_MV tmp_mv = kZeroFullMv; int tmp_err = INT_MAX; first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); @@ -657,6 +750,7 @@ static int firstpass_inter_prediction( // Motion search in 2nd reference frame. int gf_motion_error = motion_error; if ((current_frame->frame_number > 1) && golden_frame != NULL) { + FULLPEL_MV tmp_mv = kZeroFullMv; // Assume 0,0 motion with no mv overhead. xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset; xd->plane[0].pre[0].stride = golden_frame->y_stride; @@ -682,13 +776,22 @@ static int firstpass_inter_prediction( // Motion search in 3rd reference frame. 
int alt_motion_error = motion_error; - if (alt_ref_frame != NULL) { - xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset; - xd->plane[0].pre[0].stride = alt_ref_frame->y_stride; - alt_motion_error = - get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, - &x->plane[0].src, &xd->plane[0].pre[0]); - first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error); + // The ML model to predict if a flat structure (golden-frame only structure + // without ALT-REF and Internal-ARFs) is better requires stats based on + // motion search w.r.t 3rd reference frame in the first pass. As the ML + // model is enabled under certain conditions, motion search in 3rd reference + // frame is also enabled for those cases. + if (do_third_ref_motion_search(&cpi->oxcf.rc_cfg, &cpi->oxcf.gf_cfg)) { + if (alt_ref_frame != NULL) { + FULLPEL_MV tmp_mv = kZeroFullMv; + xd->plane[0].pre[0].buf = + alt_ref_frame->y_buffer + alt_ref_frame_yoffset; + xd->plane[0].pre[0].stride = alt_ref_frame->y_stride; + alt_motion_error = get_prediction_error_bitdepth( + is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error); + } } if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error && alt_motion_error < this_intra_error) { @@ -716,8 +819,7 @@ static int firstpass_inter_prediction( } // Start by assuming that intra mode is best. 
- best_ref_mv->row = 0; - best_ref_mv->col = 0; + *best_mv = kZeroMv; if (motion_error <= this_intra_error) { aom_clear_system_state(); @@ -736,28 +838,30 @@ static int firstpass_inter_prediction( (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error); } - const MV best_mv = get_mv_from_fullmv(&mv); + *best_mv = get_mv_from_fullmv(&mv); this_inter_error = motion_error; xd->mi[0]->mode = NEWMV; - xd->mi[0]->mv[0].as_mv = best_mv; + xd->mi[0]->mv[0].as_mv = *best_mv; xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; xd->mi[0]->ref_frame[1] = NONE_FRAME; - av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale, - unit_col * unit_scale, NULL, bsize, - AOM_PLANE_Y, AOM_PLANE_Y); - av1_encode_sby_pass1(cpi, x, bsize); - stats->sum_mvr += best_mv.row; - stats->sum_mvr_abs += abs(best_mv.row); - stats->sum_mvc += best_mv.col; - stats->sum_mvc_abs += abs(best_mv.col); - stats->sum_mvrs += best_mv.row * best_mv.row; - stats->sum_mvcs += best_mv.col * best_mv.col; + + if (cpi->sf.fp_sf.disable_recon == 0) { + av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale, + unit_col * unit_scale, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + av1_encode_sby_pass1(cpi, x, bsize); + } + stats->sum_mvr += best_mv->row; + stats->sum_mvr_abs += abs(best_mv->row); + stats->sum_mvc += best_mv->col; + stats->sum_mvc_abs += abs(best_mv->col); + stats->sum_mvrs += best_mv->row * best_mv->row; + stats->sum_mvcs += best_mv->col * best_mv->col; ++stats->inter_count; - *best_ref_mv = best_mv; - accumulate_mv_stats(best_mv, mv, unit_row, unit_col, unit_rows, unit_cols, - last_mv, stats); + accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols, + last_non_zero_mv, stats); } return this_inter_error; @@ -783,7 +887,7 @@ static void update_firstpass_stats(AV1_COMP *cpi, const int frame_number, const int64_t ts_duration, const BLOCK_SIZE fp_block_size) { - TWO_PASS *twopass = &cpi->twopass; + TWO_PASS *twopass = &cpi->ppi->twopass; AV1_COMMON 
*const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; @@ -817,6 +921,9 @@ static void update_firstpass_stats(AV1_COMP *cpi, fps.inactive_zone_rows = (double)stats->image_data_start_row; fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix fps.raw_error_stdev = raw_err_stdev; + fps.is_flash = 0; + fps.noise_var = (double)0; + fps.cor_coeff = (double)1.0; if (stats->mv_count > 0) { fps.MVr = (double)stats->sum_mvr / stats->mv_count; @@ -849,12 +956,20 @@ static void update_firstpass_stats(AV1_COMP *cpi, // cpi->source_time_stamp. fps.duration = (double)ts_duration; + // Invalidate the stats related to third ref motion search if not valid. + // This helps to print a warning in second pass encoding. + if (do_third_ref_motion_search(&cpi->oxcf.rc_cfg, &cpi->oxcf.gf_cfg) == 0) { + fps.pcnt_third_ref = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; + fps.tr_coded_error = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; + } + // We will store the stats inside the persistent twopass struct (and NOT the // local variable 'fps'), and then cpi->output_pkt_list will point to it. 
*this_frame_stats = fps; - output_stats(this_frame_stats, cpi->output_pkt_list); - if (cpi->twopass.stats_buf_ctx->total_stats != NULL) { - av1_accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps); + if (!cpi->ppi->lap_enabled) + output_stats(this_frame_stats, cpi->ppi->output_pkt_list); + if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) { + av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps); } /*In the case of two pass, first pass uses it as a circular buffer, * when LAP is enabled it is used as a linear buffer*/ @@ -982,6 +1097,17 @@ static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; + const int num_planes = av1_num_planes(&cpi->common); + for (int plane = 0; plane < num_planes; plane++) { + const int subsampling_xy = + plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y + : 0; + const int sb_size = MAX_SB_SQUARE >> subsampling_xy; + CHECK_MEM_ERROR( + cm, cpi->td.mb.plane[plane].src_diff, + (int16_t *)aom_memalign( + 32, sizeof(*cpi->td.mb.plane[plane].src_diff) * sb_size)); + } for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const tile_data = @@ -989,6 +1115,12 @@ static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) { first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size); } } + for (int plane = 0; plane < num_planes; plane++) { + if (cpi->td.mb.plane[plane].src_diff) { + aom_free(cpi->td.mb.plane[plane].src_diff); + cpi->td.mb.plane[plane].src_diff = NULL; + } + } } void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, @@ -997,7 +1129,7 @@ void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; CurrentFrame *const 
current_frame = &cm->current_frame; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; TileInfo *tile = &tile_data->tile_info; @@ -1105,7 +1237,7 @@ void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, cpi, td, last_frame, golden_frame, alt_ref_frame, unit_row, unit_col, recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset, fp_block_size, this_intra_error, raw_motion_err_counts, - raw_motion_err_list, &best_ref_mv, &last_mv, mb_stats); + raw_motion_err_list, best_ref_mv, &best_ref_mv, &last_mv, mb_stats); if (unit_col_in_tile == 0) { *first_top_mv = last_mv; } @@ -1138,7 +1270,7 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; CurrentFrame *const current_frame = &cm->current_frame; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const int qindex = find_fp_qindex(seq_params->bit_depth); @@ -1147,9 +1279,14 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { FeatureFlags *const features = &cm->features; av1_set_screen_content_options(cpi, features); } + + // Prepare the speed features + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + // Unit size for the first pass encoding. const BLOCK_SIZE fp_block_size = - cpi->is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16; + get_fp_block_size(cpi->is_screen_content_type); + // Number of rows in the unit size. // Note mi_params->mb_rows and mi_params->mb_cols are in the unit of 16x16. 
const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows); @@ -1250,7 +1387,7 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { (stats.image_data_start_row * unit_cols * 2)); } - TWO_PASS *twopass = &cpi->twopass; + TWO_PASS *twopass = &cpi->ppi->twopass; const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : mi_params->MBs; diff --git a/third_party/libaom/source/libaom/av1/encoder/firstpass.h b/third_party/libaom/source/libaom/av1/encoder/firstpass.h index 22969e885b..122912f72a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/firstpass.h +++ b/third_party/libaom/source/libaom/av1/encoder/firstpass.h @@ -152,6 +152,18 @@ typedef struct { * standard deviation for (0, 0) motion prediction error */ double raw_error_stdev; + /*! + * Whether the frame contains a flash + */ + int64_t is_flash; + /*! + * Estimated noise variance + */ + double noise_var; + /*! + * Correlation coefficient with the previous frame + */ + double cor_coeff; } FIRSTPASS_STATS; /*!\cond */ @@ -170,8 +182,6 @@ enum { */ typedef struct { /*!\cond */ - // The frame processing order within a GOP - unsigned char index; // Frame update type, e.g. ARF/GF/LF/Overlay FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH]; unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH]; @@ -191,6 +201,21 @@ typedef struct { REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH]; int arf_index; // the index in the gf group of ARF, if no arf, then -1 int size; // The total length of a GOP +#if CONFIG_FRAME_PARALLEL_ENCODE + // Indicates the level of parallelism in frame parallel encodes. + // 0 : frame is independently encoded (not part of parallel encodes). + // 1 : frame is the first in encode order in a given parallel encode set. + // 2 : frame occurs later in encode order in a given parallel encode set. + int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH]; + // Indicates whether a frame should act as non-reference frame. 
+ // 0 : frame is a reference frame. + // 1 : frame is a non-reference frame. + int is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH]; + + // The offset into lookahead_ctx for choosing + // source of frame parallel encodes. + int src_offset[MAX_STATIC_GF_GROUP_LENGTH]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE /*!\endcond */ } GF_GROUP; /*!\cond */ @@ -327,6 +352,15 @@ struct EncodeFrameParams; struct AV1EncoderConfig; struct TileDataEnc; +static INLINE int is_fp_wavelet_energy_invalid( + const FIRSTPASS_STATS *fp_stats) { + return (fp_stats->frame_avg_wavelet_energy < 0); +} + +static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) { + return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16); +} + int av1_get_unit_rows_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size); int av1_get_unit_cols_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size); diff --git a/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c b/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c index 31c69da7eb..01ef7b0843 100644 --- a/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c +++ b/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c @@ -108,10 +108,10 @@ static AOM_INLINE void compute_global_motion_for_ref_frame( const int do_adaptive_gm_estimation = 0; const int ref_frame_dist = get_relative_dist( - &cm->seq_params.order_hint_info, cm->current_frame.order_hint, + &cm->seq_params->order_hint_info, cm->current_frame.order_hint, cm->cur_frame->ref_order_hints[frame - LAST_FRAME]); const GlobalMotionEstimationType gm_estimation_type = - cm->seq_params.order_hint_info.enable_order_hint && + cm->seq_params->order_hint_info.enable_order_hint && abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation ? 
GLOBAL_MOTION_DISFLOW_BASED : GLOBAL_MOTION_FEATURE_BASED; @@ -126,7 +126,7 @@ static AOM_INLINE void compute_global_motion_for_ref_frame( av1_compute_global_motion(model, src_buffer, src_width, src_height, src_stride, src_corners, num_src_corners, - ref_buf[frame], cpi->common.seq_params.bit_depth, + ref_buf[frame], cpi->common.seq_params->bit_depth, gm_estimation_type, inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS); int64_t ref_frame_error = 0; @@ -284,9 +284,9 @@ static AOM_INLINE void update_valid_ref_frames_for_gm( AV1_COMMON *const cm = &cpi->common; int *num_past_ref_frames = &num_ref_frames[0]; int *num_future_ref_frames = &num_ref_frames[1]; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( - gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, gf_group->index); + gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index); for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME }; @@ -368,7 +368,7 @@ static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) { // The source buffer is 16-bit, so we need to convert to 8 bits for the // following code. We cache the result until the source frame is released. 
gm_info->src_buffer = - av1_downconvert_frame(source, cpi->common.seq_params.bit_depth); + av1_downconvert_frame(source, cpi->common.seq_params->bit_depth); } gm_info->segment_map_w = diff --git a/third_party/libaom/source/libaom/av1/encoder/gop_structure.c b/third_party/libaom/source/libaom/av1/encoder/gop_structure.c index 0e4968a72f..9cf72d2733 100644 --- a/third_party/libaom/source/libaom/av1/encoder/gop_structure.c +++ b/third_party/libaom/source/libaom/av1/encoder/gop_structure.c @@ -26,12 +26,51 @@ #include "av1/encoder/firstpass.h" #include "av1/encoder/gop_structure.h" +#if CONFIG_FRAME_PARALLEL_ENCODE +// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based +// on the value of parallel_frame_count. +static void set_frame_parallel_level(int *frame_parallel_level, + int *parallel_frame_count, + int max_parallel_frames) { + assert(*parallel_frame_count > 0); + // parallel_frame_count > 1 indicates subsequent frame(s) in the current + // parallel encode set. + *frame_parallel_level = 1 + (*parallel_frame_count > 1); + // Update the count of no. of parallel frames. + (*parallel_frame_count)++; + if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1; +} + +// This function sets gf_group->src_offset based on frame_parallel_level. +// Outputs are gf_group->src_offset and first_frame_index +static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index, + int cur_frame_idx, int frame_ind) { + if (gf_group->frame_parallel_level[frame_ind] > 0) { + if (gf_group->frame_parallel_level[frame_ind] == 1) { + *first_frame_index = cur_frame_idx; + } + + // Obtain the offset of the frame at frame_ind in the lookahead queue by + // subtracting the display order hints of the current frame from the display + // order hint of the first frame in parallel encoding set (at + // first_frame_index). 
+ gf_group->src_offset[frame_ind] = + (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) - + *first_frame_index; + } +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Set parameters for frames between 'start' and 'end' (excluding both). -static void set_multi_layer_params(const TWO_PASS *twopass, - GF_GROUP *const gf_group, RATE_CONTROL *rc, - FRAME_INFO *frame_info, int start, int end, - int *cur_frame_idx, int *frame_ind, - int layer_depth) { +static void set_multi_layer_params( + const TWO_PASS *twopass, GF_GROUP *const gf_group, + const PRIMARY_RATE_CONTROL *p_rc, RATE_CONTROL *rc, FRAME_INFO *frame_info, + int start, int end, int *cur_frame_idx, int *frame_ind, +#if CONFIG_FRAME_PARALLEL_ENCODE + int *parallel_frame_count, int max_parallel_frames, + int do_frame_parallel_encode, int *first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int layer_depth) { const int num_frames_to_process = end - start; // Either we are at the last level of the pyramid, or we don't have enough @@ -45,11 +84,21 @@ static void set_multi_layer_params(const TWO_PASS *twopass, gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS; gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost( - twopass, rc, frame_info, start, end - start, 0, NULL, NULL); + twopass, p_rc, rc, frame_info, start, end - start, 0, NULL, NULL, 0); gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Set the level of parallelism for the LF_UPDATE frame. + if (do_frame_parallel_encode) { + set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind], + parallel_frame_count, max_parallel_frames); + // Set LF_UPDATE frames as non-reference frames. 
+ gf_group->is_frame_non_ref[*frame_ind] = 1; + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); +#endif // CONFIG_FRAME_PARALLEL_ENCODE ++(*frame_ind); ++(*cur_frame_idx); ++start; @@ -65,14 +114,32 @@ static void set_multi_layer_params(const TWO_PASS *twopass, gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; +#if CONFIG_FRAME_PARALLEL_ENCODE + if (do_frame_parallel_encode) { + // If max_parallel_frames is not exceeded, encode the next internal ARF + // frame in parallel. + if (*parallel_frame_count > 1 && + *parallel_frame_count <= max_parallel_frames) { + gf_group->frame_parallel_level[*frame_ind] = 2; + *parallel_frame_count = 1; + } + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Get the boost factor for intermediate ARF frames. gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost( - twopass, rc, frame_info, m, end - m, m - start, NULL, NULL); + twopass, p_rc, rc, frame_info, m, end - m, m - start, NULL, NULL, 0); ++(*frame_ind); // Frames displayed before this internal ARF. - set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m, - cur_frame_idx, frame_ind, layer_depth + 1); + set_multi_layer_params(twopass, gf_group, p_rc, rc, frame_info, start, m, + cur_frame_idx, frame_ind, +#if CONFIG_FRAME_PARALLEL_ENCODE + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + layer_depth + 1); // Overlay for internal ARF. 
gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; @@ -82,12 +149,21 @@ static void set_multi_layer_params(const TWO_PASS *twopass, gf_group->layer_depth[*frame_ind] = layer_depth; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + +#if CONFIG_FRAME_PARALLEL_ENCODE + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); +#endif // CONFIG_FRAME_PARALLEL_ENCODE ++(*frame_ind); ++(*cur_frame_idx); // Frames displayed after this internal ARF. - set_multi_layer_params(twopass, gf_group, rc, frame_info, m + 1, end, - cur_frame_idx, frame_ind, layer_depth + 1); + set_multi_layer_params(twopass, gf_group, p_rc, rc, frame_info, m + 1, end, + cur_frame_idx, frame_ind, +#if CONFIG_FRAME_PARALLEL_ENCODE + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + layer_depth + 1); } } @@ -95,6 +171,7 @@ static int construct_multi_layer_gf_structure( AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group, RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval, FRAME_UPDATE_TYPE first_frame_update_type) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int frame_index = 0; int cur_frame_index = 0; @@ -103,6 +180,18 @@ static int construct_multi_layer_gf_structure( first_frame_update_type == OVERLAY_UPDATE || first_frame_update_type == GF_UPDATE); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Initialize gf_group->frame_parallel_level and gf_group->is_frame_non_ref to + // 0. 
+ memset( + gf_group->frame_parallel_level, 0, + sizeof(gf_group->frame_parallel_level[0]) * MAX_STATIC_GF_GROUP_LENGTH); + memset(gf_group->is_frame_non_ref, 0, + sizeof(gf_group->is_frame_non_ref[0]) * MAX_STATIC_GF_GROUP_LENGTH); + memset(gf_group->src_offset, 0, + sizeof(gf_group->src_offset[0]) * MAX_STATIC_GF_GROUP_LENGTH); +#endif + if (first_frame_update_type == KF_UPDATE && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1) { gf_group->update_type[frame_index] = ARF_UPDATE; @@ -146,7 +235,7 @@ static int construct_multi_layer_gf_structure( gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = 1; - gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost; + gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost; gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = 1; @@ -156,9 +245,25 @@ static int construct_multi_layer_gf_structure( gf_group->arf_index = -1; } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Running count of no. of frames that is part of a given parallel + // encode set in a gf_group. Value of 1 indicates no parallel encode. + int parallel_frame_count = 1; + // Enable parallel encode of frames if gf_group has a multi-layer pyramid + // structure. + int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref); + + int first_frame_index = cur_frame_index; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Rest of the frames. 
- set_multi_layer_params(twopass, gf_group, rc, frame_info, cur_frame_index, - gf_interval, &cur_frame_index, &frame_index, + set_multi_layer_params(twopass, gf_group, p_rc, rc, frame_info, + cur_frame_index, gf_interval, &cur_frame_index, + &frame_index, +#if CONFIG_FRAME_PARALLEL_ENCODE + ¶llel_frame_count, cpi->ppi->num_fp_contexts, + do_frame_parallel_encode, &first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE use_altref + 1); if (use_altref) { @@ -181,25 +286,41 @@ static int construct_multi_layer_gf_structure( gf_group->frame_type[frame_index] = INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2); +#if CONFIG_FRAME_PARALLEL_ENCODE + set_src_offset(gf_group, &first_frame_index, cur_frame_index, + frame_index); +#endif ++frame_index; } } +#if CONFIG_FRAME_PARALLEL_ENCODE + if (do_frame_parallel_encode) { + // If frame_parallel_level is set to 1 for the last LF_UPDATE + // frame in the gf_group, reset it to zero since there are no subsequent + // frames in the gf_group. + if (gf_group->frame_parallel_level[frame_index - 2] == 1) { + assert(gf_group->update_type[frame_index - 2] == LF_UPDATE); + gf_group->frame_parallel_level[frame_index - 2] = 0; + } + } +#endif return frame_index; } void av1_gop_setup_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; - TWO_PASS *const twopass = &cpi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; const int key_frame = rc->frames_since_key == 0; const FRAME_UPDATE_TYPE first_frame_update_type = - key_frame - ? KF_UPDATE - : cpi->gf_state.arf_gf_boost_lst || (rc->baseline_gf_interval == 1) - ? OVERLAY_UPDATE - : GF_UPDATE; + key_frame ? KF_UPDATE + : cpi->ppi->gf_state.arf_gf_boost_lst || + (p_rc->baseline_gf_interval == 1) + ? 
OVERLAY_UPDATE + : GF_UPDATE; gf_group->size = construct_multi_layer_gf_structure( - cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval - 1, + cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval - 1, first_frame_update_type); } diff --git a/third_party/libaom/source/libaom/av1/encoder/gop_structure.h b/third_party/libaom/source/libaom/av1/encoder/gop_structure.h index 6cfca22862..aeffb40acb 100644 --- a/third_party/libaom/source/libaom/av1/encoder/gop_structure.h +++ b/third_party/libaom/source/libaom/av1/encoder/gop_structure.h @@ -66,10 +66,11 @@ void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, int64_t gf_group_bits); /*!\cond */ -int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, +int av1_calc_arf_boost(const TWO_PASS *twopass, + const PRIMARY_RATE_CONTROL *p_rc, const RATE_CONTROL *rc, FRAME_INFO *frame_info, int offset, int f_frames, int b_frames, int *num_fpstats_used, - int *num_fpstats_required); + int *num_fpstats_required, int project_gfu_boost); /*!\endcond */ #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c index 08c167a9d6..eda5ddf78c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c +++ b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c @@ -14,6 +14,7 @@ #include "config/aom_dsp_rtcd.h" #include "av1/common/idct.h" +#include "av1/common/blockd.h" #include "av1/encoder/hybrid_fwd_txfm.h" /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per @@ -313,3 +314,26 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, default: assert(0); break; } } + +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, + tran_low_t *coeff) { + if (use_hadamard) { + switch (tx_size) { + case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); 
break; + case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break; + case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break; + case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break; + default: assert(0); + } + } else { + TxfmParam txfm_param; + txfm_param.tx_type = DCT_DCT; + txfm_param.tx_size = tx_size; + txfm_param.lossless = 0; + txfm_param.bd = bd_info.bit_depth; + txfm_param.is_hbd = bd_info.use_highbitdepth_buf; + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param); + } +} diff --git a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h index daabc7119a..30f8a2258b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h +++ b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h @@ -24,6 +24,15 @@ void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); +/*!\brief Apply Hadamard or DCT transform + * + * \callergraph + * DCT and Hadamard transforms are commonly used for quick RD score estimation. + * The coeff buffer's size should be equal to the number of pixels + * corresponding to tx_size. 
+ */ +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, tran_low_t *coeff); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/interp_search.c b/third_party/libaom/source/libaom/av1/encoder/interp_search.c index 0066c35434..dd77f6a1c0 100644 --- a/third_party/libaom/source/libaom/av1/encoder/interp_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/interp_search.c @@ -178,7 +178,7 @@ static INLINE int64_t interpolation_filter_rd( mbmi->interp_filters = filter_sets[filter_idx]; const int tmp_rs = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0); if (min_rd > *rd) { @@ -449,14 +449,23 @@ static INLINE void find_best_non_dual_interp_filter( interp_search_flags->interp_filter_search_mask; if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0); const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1); - const int *switchable_interp_p0 = - cpi->frame_probs.switchable_interp_probs[update_type][ctx0]; - const int *switchable_interp_p1 = - cpi->frame_probs.switchable_interp_probs[update_type][ctx1]; - + int *switchable_interp_p0; + int *switchable_interp_p1; +#if CONFIG_FRAME_PARALLEL_ENCODE + switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx1]; +#else + switchable_interp_p0 = + (int *)cpi->frame_probs.switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = + (int 
*)cpi->frame_probs.switchable_interp_probs[update_type][ctx1]; +#endif static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 }; const int thresh = thr[update_type]; for (i = 0; i < SWITCHABLE_FILTERS; i++) { @@ -683,7 +692,7 @@ int64_t av1_interpolation_filter_search( switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); *switchable_rate = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); // Do MC evaluation for default filter_type. // Luma MC @@ -747,7 +756,7 @@ int64_t av1_interpolation_filter_search( restore_dst_buf(xd, *tmp_dst, num_planes); const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst }; // Evaluate dual interp filters - if (cm->seq_params.enable_dual_filter) { + if (cm->seq_params->enable_dual_filter) { if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) { fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats, switchable_rate, diff --git a/third_party/libaom/source/libaom/av1/encoder/interp_search.h b/third_party/libaom/source/libaom/av1/encoder/interp_search.h index 1ee26d11ba..902b69960a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/interp_search.h +++ b/third_party/libaom/source/libaom/av1/encoder/interp_search.h @@ -37,7 +37,7 @@ typedef struct { /*!\brief Miscellaneous arguments for inter mode search. */ -typedef struct { +typedef struct HandleInterModeArgs { /*! * Buffer for the above predictor in OBMC */ @@ -139,6 +139,16 @@ typedef struct { * Estimated cmp mode. */ int cmp_mode[MODE_CTX_REF_FRAMES]; + /*! + * The best sse during single new_mv search. Note that the sse here comes from + * single_motion_search, and not from interpolation_filter_search. This has + * two implications: + * 1. The mv used to calculate the sse here does not have to be the best sse + * found in handle_inter_mode. + * 2. 
Even if the mvs agree, the sse here can differ from the sse in \ref + * MACROBLOCK::pred_sse due to different interpolation filter used. + */ + unsigned int best_single_sse_in_refs[REF_FRAMES]; } HandleInterModeArgs; /*!\cond */ diff --git a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c index 9cb0f4a118..50e53fdde1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c @@ -32,6 +32,31 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { UV_D113_PRED, UV_D45_PRED, }; +// The bitmask corresponds to the filter intra modes as defined in enums.h +// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to +// disable the evaluation of corresponding filter intra mode. The table +// av1_derived_filter_intra_mode_used_flag is used when speed feature +// prune_filter_intra_level is 1. The evaluated filter intra modes are union +// of the following: +// 1) FILTER_DC_PRED +// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED, +// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED). +static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = { + 0x01, // DC_PRED: 0000 0001 + 0x03, // V_PRED: 0000 0011 + 0x05, // H_PRED: 0000 0101 + 0x01, // D45_PRED: 0000 0001 + 0x01, // D135_PRED: 0000 0001 + 0x01, // D113_PRED: 0000 0001 + 0x09, // D157_PRED: 0000 1001 + 0x01, // D203_PRED: 0000 0001 + 0x01, // D67_PRED: 0000 0001 + 0x01, // SMOOTH_PRED: 0000 0001 + 0x01, // SMOOTH_V_PRED: 0000 0001 + 0x01, // SMOOTH_H_PRED: 0000 0001 + 0x11 // PAETH_PRED: 0001 0001 +}; + // The bitmask corresponds to the chroma intra modes as defined in enums.h // UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to // disable the evaluation of corresponding chroma intra mode. 
The table @@ -60,59 +85,6 @@ static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = { }; /*!\endcond */ -/*!\brief Calculate the rdcost of a given luma intra angle - * - * \ingroup intra_mode_search - * \callergraph - * This function runs rd calculation for a given luma intra prediction angle. - * This is used to select the best angle delta. - * - * \return Returns the rdcost of the angle and updates the mbmi if the - * new rdcost is better. - */ -static int64_t calc_rd_given_intra_angle( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, - RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, - int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map, - uint8_t *best_blk_skip, int skip_model_rd) { - RD_STATS tokenonly_rd_stats; - int64_t this_rd; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; - const int n4 = bsize_to_num_blk(bsize); - assert(!is_inter_block(mbmi)); - mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta; - if (!skip_model_rd) { - if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { - return INT64_MAX; - } - } - av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, - best_rd_in); - if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; - - int this_rate = - mode_cost + tokenonly_rd_stats.rate + - x->mode_costs - .angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta]; - this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); - - if (this_rd < *best_rd) { - memcpy(best_blk_skip, x->txfm_search_info.blk_skip, - sizeof(best_blk_skip[0]) * n4); - av1_copy_array(best_tx_type_map, xd->tx_type_map, n4); - *best_rd = this_rd; - *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y]; - *best_tx_size = mbmi->tx_size; - *rate = this_rate; - rd_stats->rate = tokenonly_rd_stats.rate; - rd_stats->dist = tokenonly_rd_stats.dist; - rd_stats->skip_txfm = 
tokenonly_rd_stats.skip_txfm; - } - return this_rd; -} - /*!\brief Search for the best filter_intra mode when coding intra frame. * * \ingroup intra_mode_search @@ -125,8 +97,12 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, int mode_cost, + PREDICTION_MODE best_mode_so_far, int64_t *best_rd, int64_t *best_model_rd, PICK_MODE_CONTEXT *ctx) { + // Skip the evaluation of filter intra modes. + if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0; + MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; int filter_intra_selected_flag = 0; @@ -134,17 +110,33 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, TX_SIZE best_tx_size = TX_8X8; FILTER_INTRA_MODE_INFO filter_intra_mode_info; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; - (void)ctx; av1_zero(filter_intra_mode_info); mbmi->filter_intra_mode_info.use_filter_intra = 1; mbmi->mode = DC_PRED; mbmi->palette_mode_info.palette_size[0] = 0; + // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have + // filter-intra as winner. + if (x->use_mb_mode_cache && + !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra) + return 0; + for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { int64_t this_rd; RD_STATS tokenonly_rd_stats; mbmi->filter_intra_mode_info.filter_intra_mode = mode; + if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) && + !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] & + (1 << mode))) + continue; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. 
+ if (x->use_mb_mode_cache && + mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode) + continue; + if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { continue; } @@ -248,6 +240,42 @@ void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, } } +void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi) { + if (mode_idx < INTRA_MODE_END) { + mbmi->mode = intra_rd_search_mode_order[mode_idx]; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + } else { + mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED; + int angle_delta = (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2); + mbmi->angle_delta[PLANE_TYPE_Y] = + (angle_delta < 3 ? (angle_delta - 3) : (angle_delta - 2)); + } +} + +int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, + int64_t top_intra_model_rd[], int model_cnt_allowed) { + const double thresh_best = 1.50; + const double thresh_top = 1.00; + for (int i = 0; i < model_cnt_allowed; i++) { + if (this_model_rd < top_intra_model_rd[i]) { + for (int j = model_cnt_allowed - 1; j > i; j--) { + top_intra_model_rd[j] = top_intra_model_rd[j - 1]; + } + top_intra_model_rd[i] = this_model_rd; + break; + } + } + if (top_intra_model_rd[model_cnt_allowed - 1] != INT64_MAX && + this_model_rd > thresh_top * top_intra_model_rd[model_cnt_allowed - 1]) + return 1; + + if (this_model_rd != INT64_MAX && + this_model_rd > thresh_best * (*best_model_rd)) + return 1; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + return 0; +} + // Run RD calculation with given chroma intra prediction angle., and return // the RD cost. Update the best mode info. if the RD cost is the best so far. static int64_t pick_intra_angle_routine_sbuv( @@ -342,125 +370,199 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, #define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \ (plane == CFL_PRED_U ? 
a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1) -static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, - TX_SIZE tx_size, int64_t best_rd) { + +static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign, + int *cfl_alpha) { + int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO; + if (cfl_linear_idx == 0) { + *cfl_sign = CFL_SIGN_ZERO; + *cfl_alpha = 0; + } else { + *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG; + *cfl_alpha = abs(cfl_linear_idx) - 1; + } +} + +static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + int plane, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int cfl_idx, + int fast_mode, RD_STATS *rd_stats) { + assert(IMPLIES(fast_mode, rd_stats == NULL)); + const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U]; - const ModeCosts *mode_costs = &x->mode_costs; - const BLOCK_SIZE plane_bsize = - get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); - - assert(is_cfl_allowed(xd) && cpi->oxcf.intra_mode_cfg.enable_cfl_intra); - assert(plane_bsize < BLOCK_SIZES_ALL); - if (!xd->lossless[mbmi->segment_id]) { - assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); - assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + int cfl_plane = get_cfl_pred_type(plane); + CFL_SIGN_TYPE cfl_sign; + int cfl_alpha; + cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha); + // We conly build CFL for a given plane, the other plane's sign is dummy + int dummy_sign = CFL_SIGN_NEG; + const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs; + const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx; + mbmi->cfl_alpha_signs = + PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign); + mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha; + int64_t cfl_cost; + if (fast_mode) { + cfl_cost = + intra_model_rd(cm, x, plane, plane_bsize, 
tx_size, /*use_hadamard=*/0); + } else { + av1_init_rd_stats(rd_stats); + av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize, + tx_size, FTXS_NONE, 0); + av1_rd_cost_update(x->rdmult, rd_stats); + cfl_cost = rd_stats->rdcost; } + mbmi->cfl_alpha_signs = orig_cfl_alpha_signs; + mbmi->cfl_alpha_idx = orig_cfl_alpha_idx; + return cfl_cost; +} + +static void cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x, + int plane, TX_SIZE tx_size, + int cfl_search_range, + RD_STATS cfl_rd_arr[CFL_MAGS_SIZE]) { + assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); + MACROBLOCKD *const xd = &x->e_mbd; xd->cfl.use_dc_pred_cache = 1; - const int64_t mode_rd = RDCOST( - x->rdmult, - mode_costs->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0); - int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; - int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; -#if CONFIG_DEBUG - int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; -#endif // CONFIG_DEBUG - - const int skip_trellis = 0; - for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { - RD_STATS rd_stats; - av1_init_rd_stats(&rd_stats); - for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { - best_rd_uv[joint_sign][plane] = INT64_MAX; - best_c[joint_sign][plane] = 0; - } - // Collect RD stats for an alpha value of zero in this plane. - // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid. 
- for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) { - const int8_t joint_sign = - PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i); - if (i == CFL_SIGN_NEG) { - mbmi->cfl_alpha_idx = 0; - mbmi->cfl_alpha_signs = joint_sign; - av1_txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, - plane_bsize, tx_size, FTXS_NONE, skip_trellis); - if (rd_stats.rate == INT_MAX) break; - } - const int alpha_rate = mode_costs->cfl_cost[joint_sign][plane][0]; - best_rd_uv[joint_sign][plane] = - RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); -#if CONFIG_DEBUG - best_rate_uv[joint_sign][plane] = rd_stats.rate; -#endif // CONFIG_DEBUG - } - } - int8_t best_joint_sign = -1; - - for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { - for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) { - int progress = 0; - for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { - int flag = 0; - RD_STATS rd_stats; - if (c > 2 && progress < c) break; - av1_init_rd_stats(&rd_stats); - for (int i = 0; i < CFL_SIGNS; i++) { - const int8_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i); - if (i == 0) { - mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c; - mbmi->cfl_alpha_signs = joint_sign; - av1_txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, - plane_bsize, tx_size, FTXS_NONE, skip_trellis); - if (rd_stats.rate == INT_MAX) break; - } - const int alpha_rate = mode_costs->cfl_cost[joint_sign][plane][c]; - int64_t this_rd = - RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); - if (this_rd >= best_rd_uv[joint_sign][plane]) continue; - best_rd_uv[joint_sign][plane] = this_rd; - best_c[joint_sign][plane] = c; -#if CONFIG_DEBUG - best_rate_uv[joint_sign][plane] = rd_stats.rate; -#endif // CONFIG_DEBUG - flag = 2; - if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue; - this_rd += mode_rd + best_rd_uv[joint_sign][!plane]; - if (this_rd >= best_rd) continue; - best_rd = this_rd; - best_joint_sign = joint_sign; + MB_MODE_INFO *const mbmi = 
xd->mi[0]; + assert(mbmi->uv_mode == UV_CFL_PRED); + const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + + const int dir_ls[2] = { 1, -1 }; + + int est_best_cfl_idx = CFL_INDEX_ZERO; + if (cfl_search_range < CFL_MAGS_SIZE) { + int fast_mode = 1; + int start_cfl_idx = CFL_INDEX_ZERO; + int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, + start_cfl_idx, fast_mode, NULL); + for (int si = 0; si < 2; ++si) { + const int dir = dir_ls[si]; + for (int i = 1; i < CFL_MAGS_SIZE; ++i) { + int cfl_idx = start_cfl_idx + dir * i; + if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; + int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, + cfl_idx, fast_mode, NULL); + if (cfl_cost < best_cfl_cost) { + best_cfl_cost = cfl_cost; + est_best_cfl_idx = cfl_idx; + } else { + break; } - progress += flag; } } } - int best_rate_overhead = INT_MAX; - uint8_t ind = 0; - if (best_joint_sign >= 0) { - const int u = best_c[best_joint_sign][CFL_PRED_U]; - const int v = best_c[best_joint_sign][CFL_PRED_V]; - ind = (u << CFL_ALPHABET_SIZE_LOG2) + v; - best_rate_overhead = mode_costs->cfl_cost[best_joint_sign][CFL_PRED_U][u] + - mode_costs->cfl_cost[best_joint_sign][CFL_PRED_V][v]; -#if CONFIG_DEBUG - xd->cfl.rate = - mode_costs->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] + - best_rate_overhead + best_rate_uv[best_joint_sign][CFL_PRED_U] + - best_rate_uv[best_joint_sign][CFL_PRED_V]; -#endif // CONFIG_DEBUG - } else { - best_joint_sign = 0; + for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) { + av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]); } - mbmi->cfl_alpha_idx = ind; - mbmi->cfl_alpha_signs = best_joint_sign; + int fast_mode = 0; + int start_cfl_idx = est_best_cfl_idx; + cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode, + &cfl_rd_arr[start_cfl_idx]); + for (int si = 0; si < 2; ++si) { + const int 
dir = dir_ls[si]; + for (int i = 1; i < cfl_search_range; ++i) { + int cfl_idx = start_cfl_idx + dir * i; + if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; + cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode, + &cfl_rd_arr[cfl_idx]); + } + } xd->cfl.use_dc_pred_cache = 0; xd->cfl.dc_pred_is_cached[0] = 0; xd->cfl.dc_pred_is_cached[1] = 0; - return best_rate_overhead; +} + +/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component + * + * \ingroup intra_mode_search + * \callergraph + * + * This function will use DCT_DCT followed by computing SATD (sum of absolute + * transformed differences) to estimate the RD score and find the best possible + * CFL parameter. + * + * Then the function will apply a full RD search near the best possible CFL + * parameter to find the best actual CFL parameter. + * + * Side effect: + * We use ths buffers in x->plane[] and xd->plane[] as throw-away buffers for RD + * search. + * + * \param[in] x Encoder prediction block structure. + * \param[in] cpi Top-level encoder instance structure. + * \param[in] tx_size Transform size. + * \param[in] ref_best_rd Reference best RD. + * \param[in] cfl_search_range The search range of full RD search near the + * estimated best CFL parameter. 
+ * + * \param[out] best_rd_stats RD stats of the best CFL parameter + * \param[out] best_cfl_alpha_idx Best CFL alpha index + * \param[out] best_cfl_alpha_signs Best CFL joint signs + * + */ +static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, + TX_SIZE tx_size, int64_t ref_best_rd, + int cfl_search_range, RD_STATS *best_rd_stats, + uint8_t *best_cfl_alpha_idx, + int8_t *best_cfl_alpha_signs) { + assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); + const ModeCosts *mode_costs = &x->mode_costs; + RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE]; + RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE]; + + av1_invalid_rd_stats(best_rd_stats); + + cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u); + cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v); + + for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) { + if (cfl_rd_arr_u[ui].rate == INT_MAX) continue; + int cfl_alpha_u; + CFL_SIGN_TYPE cfl_sign_u; + cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u); + for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) { + if (cfl_rd_arr_v[vi].rate == INT_MAX) continue; + int cfl_alpha_v; + CFL_SIGN_TYPE cfl_sign_v; + cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v); + // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a + // valid parameter for CFL + if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue; + int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1; + RD_STATS rd_stats = cfl_rd_arr_u[ui]; + av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]); + if (rd_stats.rate != INT_MAX) { + rd_stats.rate += + mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u]; + rd_stats.rate += + mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v]; + } + av1_rd_cost_update(x->rdmult, &rd_stats); + if (rd_stats.rdcost < best_rd_stats->rdcost) { + *best_rd_stats = rd_stats; + *best_cfl_alpha_idx = + (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v; + *best_cfl_alpha_signs = 
joint_sign; + } + } + } + if (best_rd_stats->rdcost >= ref_best_rd) { + av1_invalid_rd_stats(best_rd_stats); + // Set invalid CFL parameters here since the rdcost is not better than + // ref_best_rd. + *best_cfl_alpha_idx = 0; + *best_cfl_alpha_signs = 0; + return 0; + } + return 1; } int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -532,19 +634,19 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->uv_mode = mode; // Init variables for cfl and angle delta - int cfl_alpha_rate = 0; + const SPEED_FEATURES *sf = &cpi->sf; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; if (mode == UV_CFL_PRED) { if (!is_cfl_allowed(xd) || !intra_mode_cfg->enable_cfl_intra) continue; assert(!is_directional_mode); const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); - cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd); - if (cfl_alpha_rate == INT_MAX) continue; - } - mbmi->angle_delta[PLANE_TYPE_UV] = 0; - - if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) && - intra_mode_cfg->enable_angle_delta) { - const SPEED_FEATURES *sf = &cpi->sf; + if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd, + sf->intra_sf.cfl_search_range, &tokenonly_rd_stats, + &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) { + continue; + } + } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) && + intra_mode_cfg->enable_angle_delta) { if (sf->intra_sf.chroma_intra_pruning_with_hog && !intra_search_state.dir_mode_skip_mask_ready) { static const float thresh[2][4] = { @@ -554,7 +656,7 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, const int is_chroma = 1; const int is_intra_frame = frame_is_intra_only(cm); prune_intra_mode_with_hog( - x, bsize, + x, bsize, cm->seq_params->sb_size, thresh[is_intra_frame] [sf->intra_sf.chroma_intra_pruning_with_hog - 1], intra_search_state.directional_mode_skip_mask, is_chroma); @@ -577,17 +679,9 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP 
*const cpi, MACROBLOCK *x, } } const int mode_cost = - mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] + - cfl_alpha_rate; + mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode]; this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); - if (mode == UV_CFL_PRED) { - assert(is_cfl_allowed(xd) && intra_mode_cfg->enable_cfl_intra); -#if CONFIG_DEBUG - if (!xd->lossless[mbmi->segment_id]) - assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost); -#endif // CONFIG_DEBUG - } this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < best_rd) { @@ -633,8 +727,7 @@ int av1_search_palette_mode(IntraModeSearchState *intra_search_state, const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; int rate2 = 0; - int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd, - best_model_rd_palette = INT64_MAX; + int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd; int skippable = 0; uint8_t *const best_palette_color_map = x->palette_buffer->best_palette_color_map; @@ -656,11 +749,11 @@ int av1_search_palette_mode(IntraModeSearchState *intra_search_state, RD_STATS rd_stats_y; av1_invalid_rd_stats(&rd_stats_y); - av1_rd_pick_palette_intra_sby( - cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, - best_palette_color_map, &best_rd_palette, &best_model_rd_palette, - &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, - ctx, best_blk_skip, best_tx_type_map); + av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, + &best_rd_palette, &rd_stats_y.rate, NULL, + &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, + ctx, best_blk_skip, best_tx_type_map); if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { this_rd_cost->rdcost = INT64_MAX; return skippable; @@ -766,81 +859,6 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, return 0; } 
-/*!\brief Search for the best angle delta for luma prediction - * - * \ingroup intra_mode_search - * \callergraph - * Given a luma directional intra prediction mode, this function will try to - * estimate the best delta_angle. - * - * \return Returns the new rdcost of the best intra angle. - */ -static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, RD_STATS *rd_stats, - BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd, int64_t *best_model_rd, - int skip_model_rd_for_zero_deg) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; - assert(!is_inter_block(mbmi)); - - int best_angle_delta = 0; - int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; - TX_SIZE best_tx_size = mbmi->tx_size; - uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; - uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; - - for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; - - int first_try = 1; - for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - for (int i = 0; i < 2; ++i) { - const int64_t best_rd_in = - (best_rd == INT64_MAX) ? INT64_MAX - : (best_rd + (best_rd >> (first_try ? 
3 : 5))); - const int64_t this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, - (skip_model_rd_for_zero_deg & !angle_delta)); - rd_cost[2 * angle_delta + i] = this_rd; - if (first_try && this_rd == INT64_MAX) return best_rd; - first_try = 0; - if (angle_delta == 0) { - rd_cost[1] = this_rd; - break; - } - } - } - - assert(best_rd != INT64_MAX); - for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - for (int i = 0; i < 2; ++i) { - int skip_search = 0; - const int64_t rd_thresh = best_rd + (best_rd >> 5); - if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && - rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) - skip_search = 1; - if (!skip_search) { - calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0); - } - } - } - - if (rd_stats->rate != INT_MAX) { - mbmi->tx_size = best_tx_size; - mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; - const int n4 = bsize_to_num_blk(bsize); - memcpy(x->txfm_search_info.blk_skip, best_blk_skip, - sizeof(best_blk_skip[0]) * n4); - av1_copy_array(xd->tx_type_map, best_tx_type_map, n4); - } - return best_rd; -} - /*!\brief Search for the best filter_intra mode when coding inter frame. * * \ingroup intra_mode_search @@ -909,11 +927,14 @@ static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x, } } +// Evaluate a given luma intra-mode in inter frames. 
int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, - int64_t best_rd, int *mode_cost_y, int64_t *rd_y) { + int64_t best_rd, int *mode_cost_y, int64_t *rd_y, + int64_t *best_model_rd, + int64_t top_intra_model_rd[]) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -928,7 +949,7 @@ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, int known_rate = mode_cost; const int intra_cost_penalty = av1_get_intra_cost_penalty( cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty; known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0], @@ -946,32 +967,34 @@ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, !intra_search_state->dir_mode_skip_mask_ready) { const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f }; const int is_chroma = 0; - prune_intra_mode_with_hog( - x, bsize, thresh[sf->intra_sf.intra_pruning_with_hog - 1], - intra_search_state->directional_mode_skip_mask, is_chroma); + prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size, + thresh[sf->intra_sf.intra_pruning_with_hog - 1], + intra_search_state->directional_mode_skip_mask, + is_chroma); intra_search_state->dir_mode_skip_mask_ready = 1; } if (intra_search_state->directional_mode_skip_mask[mode]) return 0; - av1_init_rd_stats(rd_stats_y); - rd_stats_y->rate = INT_MAX; - int64_t model_rd = INT64_MAX; - int rate_dummy; - rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost, - best_rd, &model_rd, 0); - - } else { - av1_init_rd_stats(rd_stats_y); - mbmi->angle_delta[PLANE_TYPE_Y] = 0; - av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd); } + const TX_SIZE tx_size = 
AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd, + sf->intra_sf.top_intra_model_count_allowed)) + return 0; + av1_init_rd_stats(rd_stats_y); + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd); // Pick filter intra modes. if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { int try_filter_intra = 1; int64_t best_rd_so_far = INT64_MAX; if (rd_stats_y->rate != INT_MAX) { - const int tmp_rate = rd_stats_y->rate + - mode_costs->filter_intra_cost[bsize][0] + mode_cost; + // best_rd_so_far is the rdcost of DC_PRED without using filter_intra. + // Later, in filter intra search, best_rd_so_far is used for comparison. + mbmi->filter_intra_mode_info.use_filter_intra = 0; + const int tmp_rate = + rd_stats_y->rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); try_filter_intra = (best_rd_so_far / 2) <= best_rd; } @@ -1095,7 +1118,8 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f }; const int is_chroma = 0; prune_intra_mode_with_hog( - x, bsize, thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1], + x, bsize, cpi->common.seq_params->sb_size, + thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1], directional_mode_skip_mask, is_chroma); } mbmi->filter_intra_mode_info.use_filter_intra = 0; @@ -1105,16 +1129,21 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, set_mode_eval_params(cpi, x, MODE_EVAL); MB_MODE_INFO best_mbmi = *mbmi; - av1_zero(x->winner_mode_stats); + av1_zero_array(x->winner_mode_stats, MAX_WINNER_MODE_COUNT_INTRA); x->winner_mode_count = 0; // Searches the intra-modes except for intrabc, palette, and filter_intra. 
- for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) { + int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT; + ++mode_idx) { + set_y_mode_and_delta_angle(mode_idx, mbmi); RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int is_diagonal_mode; int64_t this_distortion, this_rd; - mbmi->mode = intra_rd_search_mode_order[mode_idx]; is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode); if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra) @@ -1132,36 +1161,43 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; // The functionality of filter intra modes and smooth prediction - // overlap. Retain the smooth prediction if filter intra modes are - // disabled. + // overlap. Hence smooth prediction is pruned only if all the + // filter intra modes are enabled. if (cpi->sf.intra_sf.disable_smooth_intra && - !cpi->sf.intra_sf.disable_filter_intra && mbmi->mode == SMOOTH_PRED) + cpi->sf.intra_sf.prune_filter_intra_level == 0 && + mbmi->mode == SMOOTH_PRED) continue; if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; - mbmi->angle_delta[PLANE_TYPE_Y] = 0; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. + if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue; is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; - if (is_directional_mode && av1_use_angle_delta(bsize) && - cpi->oxcf.intra_mode_cfg.enable_angle_delta) { - // Searches through the best angle_delta if this option is available. 
- this_rd_stats.rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, - bmode_costs[mbmi->mode], best_rd, &best_model_rd, - 1); - } else { - if (model_intra_yrd_and_prune(cpi, x, bsize, &best_model_rd)) { - continue; - } + if (is_directional_mode && av1_use_angle_delta(bsize) == 0 && + mbmi->angle_delta[PLANE_TYPE_Y] != 0) + continue; - // Builds the actual prediction. The prediction from - // model_intra_yrd_and_prune was just an estimation that did not take into - // account the effect of txfm pipeline, so we need to redo it for real - // here. - av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); - } + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. + if (!(cpi->sf.intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & + (1 << mbmi->mode))) + continue; + + const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd, + cpi->sf.intra_sf.top_intra_model_count_allowed)) + continue; + + // Builds the actual prediction. The prediction from + // model_intra_yrd_and_prune was just an estimation that did not take into + // account the effect of txfm pipeline, so we need to redo it for real + // here. 
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); this_rate_tokenonly = this_rd_stats.rate; this_distortion = this_rd_stats.dist; s = this_rd_stats.skip_txfm; @@ -1204,16 +1240,16 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (try_palette) { av1_rd_pick_palette_intra_sby( cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map, - &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable, - &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map); + &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd, + ctx, ctx->blk_skip, ctx->tx_type_map); } // Searches filter_intra - if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize) && - !cpi->sf.intra_sf.disable_filter_intra) { + if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, skippable, bsize, bmode_costs[DC_PRED], - &best_rd, &best_model_rd, ctx)) { + best_mbmi.mode, &best_rd, &best_model_rd, + ctx)) { best_mbmi = *mbmi; } } diff --git a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h index cc2a87b098..5a52440909 100644 --- a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h +++ b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h @@ -95,6 +95,9 @@ typedef struct IntraModeSearchState { * \param[out] mode_cost_y The cost needed to signal the current * intra mode. * \param[out] rd_y The rdcost of the chosen mode. + * \param[in] best_model_rd Best model RD seen for this block so far + * \param[in] top_intra_model_rd Top intra model RD seen for this + * block so far. * * \return Returns 1 if a valid intra mode is found, 0 otherwise. 
* The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and @@ -106,7 +109,9 @@ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, - int64_t best_rd, int *mode_cost_y, int64_t *rd_y); + int64_t best_rd, int *mode_cost_y, int64_t *rd_y, + int64_t *best_model_rd, + int64_t top_intra_model_rd[]); /*!\brief Search through all chroma intra-modes for inter frames. * @@ -262,6 +267,29 @@ static AOM_INLINE void init_intra_mode_search_state( intra_search_state->rate_uv_intra = INT_MAX; } +/*! \brief set the luma intra mode and delta angles for a given mode index. + * The total number of luma intra mode is LUMA_MODE_COUNT = 61. + * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional + * modes. Each of the main 8 directional modes have 6 = MAX_ANGLE_DELTA * 2 + * delta angles. + * \param[in] mode_idx mode index in intra mode decision + * process. + * \param[in] mbmi Pointer to structure holding + * the mode info for the current macroblock. + */ +void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi); + +/*! \brief prune luma intra mode based on the model rd. + * \param[in] this_model_rd model rd for current mode. + * \param[in] best_model_rd Best model RD seen for this block so + * far. + * \param[in] top_intra_model_rd Top intra model RD seen for this + * block so far. + * \param[in] model_cnt_allowed The number of top intra model RD allowed. 
+ */ +int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, + int64_t top_intra_model_rd[], int model_cnt_allowed); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h index 532482896a..0bf77ac9f5 100644 --- a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h @@ -22,8 +22,10 @@ #include "av1/common/reconintra.h" #include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/palette.h" +#include "av1/encoder/hybrid_fwd_txfm.h" #ifdef __cplusplus extern "C" { @@ -134,8 +136,13 @@ static AOM_INLINE int get_hist_bin_idx(int dx, int dy) { } #undef FIX_PREC_BITS -static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows, - int cols, float *hist) { +// Normalizes the hog data. +static AOM_INLINE void normalize_hog(float total, float *hist) { + for (int i = 0; i < BINS; ++i) hist[i] /= total; +} + +static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride, + int rows, int cols, float *hist) { float total = 0.1f; src += stride; for (int r = 1; r < rows - 1; ++r) { @@ -144,7 +151,7 @@ static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows, const uint8_t *below = &src[c + stride]; const uint8_t *left = &src[c - 1]; const uint8_t *right = &src[c + 1]; - // Calculate gradient using Sobel fitlers. + // Calculate gradient using Sobel filters. 
const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - @@ -165,13 +172,49 @@ static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows, src += stride; } - for (int i = 0; i < BINS; ++i) hist[i] /= total; + normalize_hog(total, hist); } -static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride, - int rows, int cols, float *hist) { +// Computes and stores pixel level gradient information of a given superblock +// for LBD encode. +static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint8_t *src = x->plane[plane].src.buf; + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint8_t *above = &src[c - stride]; + const uint8_t *below = &src[c + stride]; + const uint8_t *left = &src[c - 1]; + const uint8_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? 
get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride, + int rows, int cols, float *hist) { float total = 0.1f; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); src += stride; for (int r = 1; r < rows - 1; ++r) { for (int c = 1; c < cols - 1; ++c) { @@ -179,7 +222,7 @@ static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride, const uint16_t *below = &src[c + stride]; const uint16_t *left = &src[c - 1]; const uint16_t *right = &src[c + 1]; - // Calculate gradient using Sobel fitlers. + // Calculate gradient using Sobel filters. const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - @@ -200,11 +243,151 @@ static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride, src += stride; } - for (int i = 0; i < BINS; ++i) hist[i] /= total; + normalize_hog(total, hist); +} + +// Computes and stores pixel level gradient information of a given superblock +// for HBD encode. 
+static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf); + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint16_t *above = &src[c - stride]; + const uint16_t *below = &src[c + stride]; + const uint16_t *left = &src[c - 1]; + const uint16_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? 
get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows, + int cols, float *hist, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + highbd_generate_hog(src8, stride, rows, cols, hist); + return; + } +#else + (void)highbd; +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_generate_hog(src8, stride, rows, cols, hist); +} + +static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(&x->e_mbd)) { + highbd_compute_gradient_info_sb(x, sb_size, plane); + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_compute_gradient_info_sb(x, sb_size, plane); +} + +// Function to generate pixel level gradient information for a given superblock. +// Sets the flags 'is_sb_gradient_cached' for the specific plane-type if +// gradient info is generated for the same. +static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { + const SPEED_FEATURES *sf = &cpi->sf; + // Initialise flags related to hog data caching. + x->is_sb_gradient_cached[PLANE_TYPE_Y] = false; + x->is_sb_gradient_cached[PLANE_TYPE_UV] = false; + + // SB level caching of gradient data may not help in speedup for the following + // cases: + // (1) Inter frames (due to early intra gating) + // (2) When partition_search_type is not SEARCH_PARTITION + // Hence, gradient data is computed at block level in such cases. 
+ + if (!frame_is_intra_only(&cpi->common) || + sf->part_sf.partition_search_type != SEARCH_PARTITION) + return; + + const int num_planes = av1_num_planes(&cpi->common); + + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); + + if (sf->intra_sf.intra_pruning_with_hog) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y); + x->is_sb_gradient_cached[PLANE_TYPE_Y] = true; + } + if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV); + x->is_sb_gradient_cached[PLANE_TYPE_UV] = true; + } +} + +// Reuses the pixel level gradient data generated at superblock level for block +// level histogram computation. +static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x, + int rows, int cols, + BLOCK_SIZE sb_size, + PLANE_TYPE plane, + float *hist) { + float total = 0.1f; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + + // Derive the offset from the starting of the superblock in order to locate + // the block level gradient data in the cache. + const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); + const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); + const int block_offset_in_grad_cache = + sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) + + (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x)); + const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info + + plane * MAX_SB_SQUARE + + block_offset_in_grad_cache; + + // Retrieve the cached gradient information and generate the histogram. 
+ for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint16_t abs_dx_abs_dy_sum = + grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum; + if (!abs_dx_abs_dy_sum) continue; + total += abs_dx_abs_dy_sum; + const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero; + if (is_dx_zero) { + hist[0] += abs_dx_abs_dy_sum >> 1; + hist[BINS - 1] += abs_dx_abs_dy_sum >> 1; + } else { + const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx; + assert(idx >= 0 && idx < BINS); + hist[idx] += abs_dx_abs_dy_sum; + } + } + } + normalize_hog(total, hist); } static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, - int plane, float *hog) { + BLOCK_SIZE sb_size, int plane, float *hog) { const MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; @@ -217,12 +400,15 @@ static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, const int cols = ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >> ss_x; - const int src_stride = x->plane[plane].src.stride; - const uint8_t *src = x->plane[plane].src.buf; - if (is_cur_buf_hbd(xd)) { - generate_hog_hbd(src, src_stride, rows, cols, hog); + + // If gradient data is already generated at SB level, reuse the cached data. + // Otherwise, compute the data. 
+ if (x->is_sb_gradient_cached[plane]) { + generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog); } else { - generate_hog(src, src_stride, rows, cols, hog); + const uint8_t *src = x->plane[plane].src.buf; + const int src_stride = x->plane[plane].src.stride; + generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd)); } // Scale the hog so the luma and chroma are on the same scale @@ -232,13 +418,13 @@ static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, } static AOM_INLINE void prune_intra_mode_with_hog( - const MACROBLOCK *x, BLOCK_SIZE bsize, float th, + const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th, uint8_t *directional_mode_skip_mask, int is_chroma) { aom_clear_system_state(); const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y; float hist[BINS] = { 0.0f }; - collect_hog_data(x, bsize, plane, hist); + collect_hog_data(x, bsize, sb_size, plane, hist); // Make prediction for each of the mode float scores[DIRECTIONAL_MODES] = { 0.0f }; @@ -305,7 +491,7 @@ static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi, const int n_cache = av1_get_palette_cache(xd, 0, color_cache); palette_mode_cost += av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, - n_cache, cpi->common.seq_params.bit_depth); + n_cache, cpi->common.seq_params->bit_depth); palette_mode_cost += av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; @@ -365,7 +551,7 @@ static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi, uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); palette_mode_cost += av1_palette_color_cost_uv( - pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth); + pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth); palette_mode_cost += av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; @@ -385,11 +571,11 @@ static AOM_INLINE int 
intra_mode_info_cost_uv(const AV1_COMP *cpi, /*!\cond */ // Makes a quick intra prediction and estimate the rdcost with a model without // going through the whole txfm/quantize/itxfm process. -static int64_t intra_model_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, +static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x, int plane, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size) { - const AV1_COMMON *cm = &cpi->common; + TX_SIZE tx_size, int use_hadamard) { MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); int row, col; assert(!is_inter_block(xd->mi[0])); const int stepr = tx_size_high_unit[tx_size]; @@ -405,27 +591,16 @@ static int64_t intra_model_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + // Here we use p->src_diff and p->coeff as temporary buffers for + // prediction residue and transform coefficients. The buffers are only + // used in this for loop, therefore we don't need to properly add offset + // to the buffers. 
av1_subtract_block( - xd, txbh, txbw, p->src_diff, block_size_wide[plane_bsize], + bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize], p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride, pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride); - switch (tx_size) { - case TX_4X4: - aom_hadamard_4x4(p->src_diff, block_size_wide[plane_bsize], p->coeff); - break; - case TX_8X8: - aom_hadamard_8x8(p->src_diff, block_size_wide[plane_bsize], p->coeff); - break; - case TX_16X16: - aom_hadamard_16x16(p->src_diff, block_size_wide[plane_bsize], - p->coeff); - break; - case TX_32X32: - aom_hadamard_32x32(p->src_diff, block_size_wide[plane_bsize], - p->coeff); - break; - default: assert(0); - } + av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff, + block_size_wide[plane_bsize], p->coeff); satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]); } } @@ -448,7 +623,9 @@ static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi, int64_t *best_model_rd) { const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); const int plane = 0; - const int64_t this_model_rd = intra_model_rd(cpi, x, plane, bsize, tx_size); + const AV1_COMMON *cm = &cpi->common; + const int64_t this_model_rd = + intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 2)) { return 1; diff --git a/third_party/libaom/source/libaom/av1/encoder/level.c b/third_party/libaom/source/libaom/av1/encoder/level.c index 7a74c460e4..4e1749a1dd 100644 --- a/third_party/libaom/source/libaom/av1/encoder/level.c +++ b/third_party/libaom/source/libaom/av1/encoder/level.c @@ -353,7 +353,7 @@ static double time_to_decode_frame(const AV1_COMMON *const cm, if (spatial_layer_dimensions_present_flag) { assert(0 && "Spatial layer dimensions not supported yet."); } else { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = 
cm->seq_params; const int max_frame_width = seq_params->max_frame_width; const int max_frame_height = seq_params->max_frame_height; luma_samples = max_frame_width * max_frame_height; @@ -473,7 +473,7 @@ void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level, decoder_model->level = level; const AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; decoder_model->bit_rate = get_max_bitrate( av1_level_defs + level, seq_params->tier[op_index], seq_params->profile); @@ -690,7 +690,7 @@ void av1_decoder_model_process_frame(const AV1_COMP *const cpi, void av1_init_level_info(AV1_COMP *cpi) { for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) { AV1LevelInfo *const this_level_info = - cpi->level_params.level_info[op_index]; + cpi->ppi->level_params.level_info[op_index]; if (!this_level_info) continue; memset(this_level_info, 0, sizeof(*this_level_info)); AV1LevelSpec *const level_spec = &this_level_info->level_spec; @@ -1048,7 +1048,7 @@ static void scan_past_frames(const FrameWindowBuffer *const buffer, void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, int64_t ts_end) { AV1_COMMON *const cm = &cpi->common; - const AV1LevelParams *const level_params = &cpi->level_params; + const AV1LevelParams *const level_params = &cpi->ppi->level_params; const int upscaled_width = cm->superres_upscaled_width; const int width = cm->width; @@ -1057,7 +1057,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, const int tile_rows = cm->tiles.rows; const int tiles = tile_cols * tile_rows; const int luma_pic_size = upscaled_width * height; - const int frame_header_count = level_params->frame_header_count; + const int frame_header_count = cpi->frame_header_count; const int show_frame = cm->show_frame; const int show_existing_frame = cm->show_existing_frame; @@ -1075,7 +1075,7 @@ void 
av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, const int temporal_layer_id = cm->temporal_layer_id; const int spatial_layer_id = cm->spatial_layer_id; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const BITSTREAM_PROFILE profile = seq_params->profile; const int is_still_picture = seq_params->still_picture; // update level_stats @@ -1148,7 +1148,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, if (fail_id != TARGET_LEVEL_OK) { const int target_level_major = 2 + (target_level >> 2); const int target_level_minor = target_level & 3; - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to encode to the target level %d_%d. %s", target_level_major, target_level_minor, level_fail_messages[fail_id]); diff --git a/third_party/libaom/source/libaom/av1/encoder/level.h b/third_party/libaom/source/libaom/av1/encoder/level.h index 5e0cce2007..2800e3d40d 100644 --- a/third_party/libaom/source/libaom/av1/encoder/level.h +++ b/third_party/libaom/source/libaom/av1/encoder/level.h @@ -164,8 +164,6 @@ typedef struct AV1LevelParams { uint32_t keep_level_stats; // Level information for each operating point. AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS]; - // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation. 
- int frame_header_count; } AV1LevelParams; static INLINE int is_in_operating_point(int operating_point, diff --git a/third_party/libaom/source/libaom/av1/encoder/mcomp.c b/third_party/libaom/source/libaom/av1/encoder/mcomp.c index 06f9386102..1a53c23c74 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mcomp.c +++ b/third_party/libaom/source/libaom/av1/encoder/mcomp.c @@ -95,7 +95,7 @@ void av1_make_default_fullpel_ms_params( // High level params ms_params->bsize = bsize; - ms_params->vfp = &cpi->fn_ptr[bsize]; + ms_params->vfp = &cpi->ppi->fn_ptr[bsize]; init_ms_buffers(&ms_params->ms_buffers, x); @@ -145,8 +145,8 @@ void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params, MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; mv_cost_params->mvjcost = dv_costs->joint_mv; - mv_cost_params->mvcost[0] = &dv_costs->mv_component[0][MV_MAX]; - mv_cost_params->mvcost[1] = &dv_costs->mv_component[1][MV_MAX]; + mv_cost_params->mvcost[0] = dv_costs->dv_costs[0]; + mv_cost_params->mvcost[1] = dv_costs->dv_costs[1]; } void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, @@ -167,7 +167,7 @@ void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, x->errorperbit, x->sadperbit); // Subpel variance params - ms_params->var_params.vfp = &cpi->fn_ptr[bsize]; + ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize]; ms_params->var_params.subpel_search_type = cpi->sf.mv_sf.use_accurate_subpel_search; ms_params->var_params.w = block_size_wide[bsize]; @@ -253,7 +253,7 @@ static INLINE int mv_cost(const MV *mv, const int *joint_cost, // nearest 2 ** 7. // This is NOT used during motion compensation. 
int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, - int *mvcost[2], int weight) { + int *const mvcost[2], int weight) { const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; return ROUND_POWER_OF_TWO( mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7); @@ -290,6 +290,9 @@ static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, static INLINE int mv_err_cost_(const MV *mv, const MV_COST_PARAMS *mv_cost_params) { + if (mv_cost_params->mv_cost_type == MV_COST_NONE) { + return 0; + } return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost, mv_cost_params->mvcost, mv_cost_params->error_per_bit, mv_cost_params->mv_cost_type); @@ -1830,7 +1833,7 @@ int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos), GET_MV_SUBPEL(ref_block_hash.x - x_pos) }; if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize, - cpi->common.seq_params.mib_size_log2)) + cpi->common.seq_params->mib_size_log2)) continue; FULLPEL_MV hash_mv; @@ -1957,8 +1960,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, if (xd->bd != 8) { unsigned int sad; best_int_mv->as_fullmv = kZeroFullMv; - sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); + sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; @@ -2001,7 +2004,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, FULLPEL_MV this_mv = best_int_mv->as_fullmv; src_buf = x->plane[0].src.buf; ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); - best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + best_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); { const uint8_t *const pos[4] = { @@ -2011,7 +2015,8 @@ unsigned int 
av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, ref_buf + ref_stride, }; - cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); + cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, + this_sad); } for (idx = 0; idx < 4; ++idx) { @@ -2034,7 +2039,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); - tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + tmp_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); if (best_sad > tmp_sad) { best_int_mv->as_fullmv = this_mv; best_sad = tmp_sad; @@ -2265,7 +2271,6 @@ static INLINE int get_subpel_part(int x) { return x & 7; } // Gets the address of the ref buffer at subpel location (r, c), rounded to the // nearest fullpel precision toward - \infty - static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV mv) { const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3); diff --git a/third_party/libaom/source/libaom/av1/encoder/mcomp.h b/third_party/libaom/source/libaom/av1/encoder/mcomp.h index 901671e27f..b2539f5100 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mcomp.h +++ b/third_party/libaom/source/libaom/av1/encoder/mcomp.h @@ -84,7 +84,7 @@ typedef struct { } MV_COST_PARAMS; int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, - int *mvcost[2], int weight); + int *const mvcost[2], int weight); int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, diff --git a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c index 96b77b754d..07485bd68c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c +++ b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c @@ -15,6 +15,7 @@ #include "av1/encoder/encodemv.h" 
#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_strategy.h" @@ -41,7 +42,7 @@ static int compare_weight(const void *a, const void *b) { // Allow more mesh searches for screen content type on the ARF. static int use_fine_search_interval(const AV1_COMP *const cpi) { return cpi->is_screen_content_type && - cpi->gf_group.update_type[cpi->gf_group.index] == ARF_UPDATE && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE && cpi->oxcf.speed <= 2; } @@ -62,15 +63,15 @@ static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, const int mi_col = xd->mi_col; const BLOCK_SIZE tpl_bsize = - convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d); + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); const int tplw = mi_size_wide[tpl_bsize]; const int tplh = mi_size_high[tpl_bsize]; const int nw = mi_size_wide[bsize] / tplw; const int nh = mi_size_high[bsize] / tplh; if (nw >= 1 && nh >= 1) { - const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size]; - const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size]; + const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size]; + const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size]; const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw; int valid = 1; @@ -119,7 +120,8 @@ static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int ref_idx, int *rate_mv, int search_range, inter_mode_info *mode_info, - int_mv *best_mv) { + int_mv *best_mv, + struct HandleInterModeArgs *const args) { MACROBLOCKD *xd = &x->e_mbd; const AV1_COMMON *cm = &cpi->common; const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; @@ -243,13 +245,9 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, } } - // Terminate 
search with the current ref_idx if we have already encountered - // another ref_mv in the drl such that: - // 1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION - // search process as the current fullpel_mv. - // 2. The rate needed to encode the current fullpel_mv is larger than that - // for the other ref_mv. - if (cpi->sf.inter_sf.skip_repeated_full_newmv && + // Terminate search with the current ref_idx based on fullpel mv, rate cost, + // and other know cost. + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 && mbmi->motion_mode == SIMPLE_TRANSLATION && best_mv->as_int != INVALID_MV) { int_mv this_mv; @@ -260,6 +258,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, mv_costs->mv_cost_stack, MV_COST_WEIGHT); mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int; mode_info[ref_mv_idx].full_mv_rate = this_mv_rate; + mode_info[ref_mv_idx].full_mv_bestsme = bestsme; for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { // Check if the motion search result same as previous results @@ -280,6 +279,19 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, return; } } + + // Terminate the evaluation of current ref_mv_idx based on bestsme and + // drl_cost. + const int psme = mode_info[prev_ref_idx].full_mv_bestsme; + if (psme == INT_MAX) continue; + const int thr = + cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? 
(psme + (psme >> 2)) : psme; + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 && + mode_info[ref_mv_idx].full_mv_bestsme > thr && + mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) { + best_mv->as_int = INVALID_MV; + return; + } } } @@ -289,6 +301,8 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, const int use_fractional_mv = bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + int best_mv_rate = 0; + int mv_rate_calculated = 0; if (use_fractional_mv) { int_mv fractional_ms_list[3]; av1_set_fractional_mv(fractional_ms_list); @@ -337,9 +351,10 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) { + unsigned int sse; const int this_var = mv_search_params->find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, - &x->pred_sse[ref], fractional_ms_list); + &sse, fractional_ms_list); if (!cpi->sf.mv_sf.disable_second_mv) { // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost @@ -358,11 +373,17 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t tmp_rd = RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate, tmp_rd_stats.dist); - if (tmp_rd < rd) best_mv->as_mv = this_best_mv; + if (tmp_rd < rd) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } } else { // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the // best MV. - if (this_var < best_mv_var) best_mv->as_mv = this_best_mv; + if (this_var < best_mv_var) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } } } } @@ -379,9 +400,52 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, break; default: assert(0 && "Invalid motion mode!\n"); } + + // Terminate search with the current ref_idx based on subpel mv and rate + // cost. 
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL && + mbmi->motion_mode == SIMPLE_TRANSLATION && + best_mv->as_int != INVALID_MV) { + const int ref_mv_idx = mbmi->ref_mv_idx; + best_mv_rate = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + mv_rate_calculated = 1; + + for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { + if (!args->single_newmv_valid[prev_ref_idx][ref]) continue; + // Check if the motion vectors are the same. + if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) { + // Skip this evaluation if the previous one is skipped. + if (mode_info[prev_ref_idx].skip) { + mode_info[ref_mv_idx].skip = 1; + break; + } + // Compare the rate cost that we current know. + const int prev_rate_cost = + args->single_newmv_rate[prev_ref_idx][ref] + + mode_info[prev_ref_idx].drl_cost; + const int this_rate_cost = + best_mv_rate + mode_info[ref_mv_idx].drl_cost; + + if (prev_rate_cost <= this_rate_cost) { + // If the current rate_cost is worse than the previous rate_cost, + // then we terminate the search for this ref_mv_idx. 
+ mode_info[ref_mv_idx].skip = 1; + break; + } + } + } + } + } + + if (mv_rate_calculated) { + *rate_mv = best_mv_rate; + } else { + *rate_mv = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); } - *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, - mv_costs->mv_cost_stack, MV_COST_WEIGHT); } int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, @@ -920,7 +984,7 @@ int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, const uint8_t *dst = xd->plane[0].dst.buf; const int dst_stride = xd->plane[0].dst.stride; - *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse); + *var = cpi->ppi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse); return best_mv; } diff --git a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h index 5736f2b756..bf81fe243a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h +++ b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h @@ -21,20 +21,19 @@ extern "C" { // TODO(any): rename this struct to something else. There is already another // struct called inter_modes_info, which makes this terribly confusing. 
typedef struct { - int64_t rd; int drl_cost; - - int rate_mv; - int_mv mv; - int_mv full_search_mv; int full_mv_rate; + int full_mv_bestsme; + int skip; } inter_mode_info; +struct HandleInterModeArgs; void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int ref_idx, int *rate_mv, int search_range, inter_mode_info *mode_info, - int_mv *best_mv); + int_mv *best_mv, + struct HandleInterModeArgs *const args); int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, diff --git a/third_party/libaom/source/libaom/av1/encoder/mv_prec.c b/third_party/libaom/source/libaom/av1/encoder/mv_prec.c index cc81d72170..ae9dc35af4 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mv_prec.c +++ b/third_party/libaom/source/libaom/av1/encoder/mv_prec.c @@ -230,7 +230,7 @@ static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, const int y_stride = cpi->source->y_stride; const int px_row = 4 * mi_row, px_col = 4 * mi_col; const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; - const int bd = cm->seq_params.bit_depth; + const int bd = cm->seq_params->bit_depth; if (buf_is_hbd) { uint16_t *source_buf = CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col; @@ -339,8 +339,8 @@ static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, const int mi_row_end = tile_info->mi_row_end; const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; - const int sb_size_mi = cm->seq_params.mib_size; - BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const int sb_size_mi = cm->seq_params->mib_size; + BLOCK_SIZE sb_size = cm->seq_params->sb_size; for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) { for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) { collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size); @@ -349,7 +349,7 @@ static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, } 
void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { - MV_STATS *mv_stats = &cpi->mv_stats; + MV_STATS *mv_stats = &cpi->ppi->mv_stats; const AV1_COMMON *cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; @@ -420,8 +420,8 @@ void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) { } #if !CONFIG_REALTIME_ONLY else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && - av1_frame_allows_smart_mv(cpi) && cpi->mv_stats.valid) { - use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex); + av1_frame_allows_smart_mv(cpi) && cpi->ppi->mv_stats.valid) { + use_hp = get_smart_mv_prec(cpi, &cpi->ppi->mv_stats, qindex); } #endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/libaom/source/libaom/av1/encoder/mv_prec.h b/third_party/libaom/source/libaom/av1/encoder/mv_prec.h index 89f95f553e..11dcdd8806 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mv_prec.h +++ b/third_party/libaom/source/libaom/av1/encoder/mv_prec.h @@ -21,8 +21,8 @@ void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { - const int gf_group_index = cpi->gf_group.index; - const int gf_update_type = cpi->gf_group.update_type[gf_group_index]; + const int gf_group_index = cpi->gf_frame_index; + const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index]; return !frame_is_intra_only(&cpi->common) && !(gf_update_type == INTNL_OVERLAY_UPDATE || gf_update_type == OVERLAY_UPDATE); diff --git a/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c b/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c index 279fd922dd..088135a2dd 100644 --- a/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c +++ b/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c @@ -353,6 +353,8 @@ static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x, (void)tile_data; x->pred_mv_sad[ref_frame] = INT_MAX; + x->pred_mv0_sad[ref_frame] = 
INT_MAX; + x->pred_mv1_sad[ref_frame] = INT_MAX; frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; // TODO(kyslov) this needs various further optimizations. to be continued.. assert(yv12 != NULL); @@ -518,7 +520,7 @@ static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize, TX_SIZE tx_size; const TxfmSearchParams *txfm_params = &x->txfm_search_params; if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) { - if (sse > (var << 2)) + if (sse > (var << 1)) tx_size = AOMMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); @@ -729,9 +731,9 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize, (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i, i); - var_uv[j] = cpi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, - puvd->dst.buf, puvd->dst.stride, - &sse_uv[j]); + var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf( + puv->src.buf, puv->src.stride, puvd->dst.buf, puvd->dst.stride, + &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) skip_uv[j] = 1; @@ -776,8 +778,8 @@ static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int rate; int64_t dist; - unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); + unsigned int var = cpi->ppi->fn_ptr[bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse); if (calculate_rd) { @@ -1171,8 +1173,8 @@ static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, unsigned int var; if (!x->color_sensitivity[i - 1]) continue; - var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, - pd->dst.stride, &sse); + var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); assert(sse >= var); tot_sse += sse; @@ -1251,12 +1253,12 
@@ static void estimate_block_intra(int plane, int block, int row, int col, (void)block; - p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; - pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; - av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); av1_invalid_rd_stats(&this_rdc); + p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + if (plane == 0) { block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx, AOMMIN(tx_size, TX_16X16)); @@ -1562,7 +1564,7 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, else model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1); pf_rd_stats[i].rate += av1_get_switchable_rate( - x, xd, cm->features.interp_filter, cm->seq_params.enable_dual_filter); + x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter); cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist); pf_tx_size[i] = mi->tx_size; if (cost < best_cost) { @@ -1618,6 +1620,7 @@ typedef struct _mode_search_stat { static void compute_intra_yprediction(const AV1_COMMON *cm, PREDICTION_MODE mode, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd) { + const SequenceHeader *seq_params = cm->seq_params; struct macroblockd_plane *const pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; uint8_t *const src_buf_base = p->src.buf; @@ -1644,10 +1647,11 @@ static void compute_intra_yprediction(const AV1_COMMON *cm, for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; - av1_predict_intra_block(cm, xd, block_size_wide[bsize], - block_size_high[bsize], tx_size, mode, 0, 0, - FILTER_INTRA_MODES, pd->dst.buf, dst_stride, - pd->dst.buf, dst_stride, 0, 0, plane); + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + 
block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0, + FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride, + 0, 0, plane); } } p->src.buf = src_buf_base; @@ -1671,7 +1675,9 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, const MB_MODE_INFO *left_mi = xd->left_mbmi; const PREDICTION_MODE A = av1_above_block_mode(above_mi); const PREDICTION_MODE L = av1_left_block_mode(left_mi); - bmode_costs = x->mode_costs.y_mode_costs[A][L]; + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx]; av1_invalid_rd_stats(&best_rdc); av1_invalid_rd_stats(&this_rdc); @@ -1734,10 +1740,11 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x, int *force_skip_low_temp_var) { AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); // For SVC the usage of alt_ref is determined by the ref_frame_flags. - int use_alt_ref_frame = cpi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame; + int use_alt_ref_frame = + cpi->ppi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame; int use_golden_ref_frame = 1; use_ref_frame[LAST_FRAME] = 1; // we never skip LAST @@ -1832,7 +1839,7 @@ static void estimate_intra_mode( int intra_cost_penalty = av1_get_intra_cost_penalty( quant_params->base_qindex, quant_params->y_dc_delta_q, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0); int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd; // For spatial enhancemanent layer: turn off intra prediction if the @@ -1851,8 +1858,8 @@ static void estimate_intra_mode( // Adjust thresholds to make intra mode likely tested if the other // references (golden, alt) are skipped/not checked. 
For now always // adjust for svc mode. - if (cpi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 && - cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) { + if (cpi->ppi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 && + cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) { spatial_var_thresh = 150; motion_thresh = 0; } @@ -2063,6 +2070,40 @@ static AOM_INLINE int skip_mode_by_bsize_and_ref_frame( return 0; } +void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int y_sad, + unsigned int source_variance) { + const int factor = (bsize >= BLOCK_32X32) ? 2 : 3; + NOISE_LEVEL noise_level = kLow; + int norm_sad = + y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + // If the spatial source variance is high and the normalized y_sad + // is low, then y-channel is likely good for mode estimation, so keep + // color_sensitivity off. For low noise content for now, since there is + // some bdrate regression for noisy color clip. 
+ if (cpi->noise_estimate.enabled) + noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level == kLow && source_variance > 1000 && norm_sad < 50) { + x->color_sensitivity[0] = 0; + x->color_sensitivity[1] = 0; + return; + } + for (int i = 1; i <= 2; ++i) { + if (x->color_sensitivity[i - 1] == 2) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + const int norm_uv_sad = + uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]); + x->color_sensitivity[i - 1] = + uv_sad > (factor * (y_sad >> 3)) && norm_uv_sad > 40; + } + } +} + void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { @@ -2104,7 +2145,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]); PRED_BUFFER *this_mode_pred = NULL; const int reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd && - cm->seq_params.bit_depth == AOM_BITS_8; + cm->seq_params->bit_depth == AOM_BITS_8; const int bh = block_size_high[bsize]; const int bw = block_size_wide[bsize]; @@ -2135,7 +2176,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, cpi->common.height != cpi->resize_pending_params.height)); #endif - + x->color_sensitivity[0] = x->color_sensitivity_sb[0]; + x->color_sensitivity[1] = x->color_sensitivity_sb[1]; init_best_pickmode(&best_pickmode); const ModeCosts *mode_costs = &x->mode_costs; @@ -2170,7 +2212,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { - // if (cpi->use_svc) denoise_svc_pickmode = av1_denoise_svc_non_key(cpi); + 
// if (cpi->ppi->use_svc) denoise_svc_pickmode = + // av1_denoise_svc_non_key(cpi); if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) av1_denoiser_reset_frame_stats(ctx); } @@ -2183,7 +2226,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, // to source, so use subpel motion vector to compensate. The nonzero motion // is half pixel shifted to left and top, so (-4, -4). This has more effect // on higher resolutins, so condition it on that for now. - if (cpi->use_svc && svc->spatial_layer_id > 0 && + if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && cm->width * cm->height > 640 * 480) { svc_mv_col = -4; @@ -2210,7 +2253,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, const int use_model_yrd_large = cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block && !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && - quant_params->base_qindex && cm->seq_params.bit_depth == 8; + quant_params->base_qindex && cm->seq_params->bit_depth == 8; const int enable_filter_search = is_filter_search_enabled(cpi, mi_row, mi_col, bsize); @@ -2264,7 +2307,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, if (!use_ref_frame_mask[ref_frame]) continue; force_mv_inter_layer = 0; - if (cpi->use_svc && svc->spatial_layer_id > 0 && + if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && ((ref_frame == LAST_FRAME && svc->skip_mvsearch_last) || (ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf))) { // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), @@ -2306,6 +2349,10 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) continue; } } + // Check for skipping NEARMV based on pred_mv_sad. 
+ if (this_mode == NEARMV && x->pred_mv1_sad[ref_frame] != INT_MAX && + x->pred_mv1_sad[ref_frame] > (x->pred_mv0_sad[ref_frame] << 1)) + continue; if (skip_mode_by_threshold( this_mode, ref_frame, frame_mv[this_mode][ref_frame], @@ -2357,6 +2404,22 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, #if COLLECT_PICK_MODE_STAT ms_stat.num_nonskipped_searches[bsize][this_mode]++; #endif + + if (idx == 0) { + // Set color sensitivity on first tested mode only. + // Use y-sad already computed in find_predictors: take the sad with motion + // vector closest to 0; the uv-sad computed below in set_color_sensitivity + // is for zeromv. + int y_sad = x->pred_mv0_sad[LAST_FRAME]; + if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX && + (abs(frame_mv[NEARMV][LAST_FRAME].as_mv.col) + + abs(frame_mv[NEARMV][LAST_FRAME].as_mv.row)) < + (abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) + + abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.row))) + y_sad = x->pred_mv1_sad[LAST_FRAME]; + set_color_sensitivity(cpi, x, xd, bsize, y_sad, x->source_variance); + } + if (enable_filter_search && !force_mv_inter_layer && ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) && (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) { diff --git a/third_party/libaom/source/libaom/av1/encoder/optical_flow.c b/third_party/libaom/source/libaom/av1/encoder/optical_flow.c index 82ae9c5774..d2f03ed641 100644 --- a/third_party/libaom/source/libaom/av1/encoder/optical_flow.c +++ b/third_party/libaom/source/libaom/av1/encoder/optical_flow.c @@ -819,7 +819,7 @@ static void solve_horn_schunck(const double *ix, const double *iy, } av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, 2 * width * height, &A); - // substract init mv part from b + // subtract init mv part from b av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height); for (int i = 0; i < 2 * width * height; i++) { b[i] = -temp_b[i]; @@ -882,10 +882,11 @@ static void 
solve_horn_schunck(const double *ix, const double *iy, } // Calculate optical flow from from_frame to to_frame using the H-S method. -void horn_schunck(const YV12_BUFFER_CONFIG *from_frame, - const YV12_BUFFER_CONFIG *to_frame, const int level, - const int mv_stride, const int mv_height, const int mv_width, - const OPFL_PARAMS *opfl_params, LOCALMV *mvs) { +static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, const int level, + const int mv_stride, const int mv_height, + const int mv_width, const OPFL_PARAMS *opfl_params, + LOCALMV *mvs) { // mvs are always on level 0, here we define two new mv arrays that is of size // of this level. const int fw = from_frame->y_crop_width; diff --git a/third_party/libaom/source/libaom/av1/encoder/palette.c b/third_party/libaom/source/libaom/av1/encoder/palette.c index fd579b7f7f..fbc16ca742 100644 --- a/third_party/libaom/source/libaom/av1/encoder/palette.c +++ b/third_party/libaom/source/libaom/av1/encoder/palette.c @@ -218,12 +218,12 @@ static AOM_INLINE void palette_rd_y( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, - uint8_t *tx_type_map, int *beat_best_palette_rd) { + uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, int *skippable, int *beat_best_rd, + PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, uint8_t *tx_type_map, + int *beat_best_palette_rd) { optimize_palette_colors(color_cache, n_cache, n, 1, centroids, - cpi->common.seq_params.bit_depth); + cpi->common.seq_params->bit_depth); const int num_unique_colors = av1_remove_duplicates(centroids, n); if 
(num_unique_colors < PALETTE_MIN_SIZE) { // Too few unique colors to create a palette. And DC_PRED will work @@ -231,10 +231,10 @@ static AOM_INLINE void palette_rd_y( return; } PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { for (int i = 0; i < num_unique_colors; ++i) { pmi->palette_colors[i] = clip_pixel_highbd( - (int)centroids[i], cpi->common.seq_params.bit_depth); + (int)centroids[i], cpi->common.seq_params->bit_depth); } } else { for (int i = 0; i < num_unique_colors; ++i) { @@ -251,10 +251,6 @@ static AOM_INLINE void palette_rd_y( 1); extend_palette_color_map(color_map, cols, rows, block_width, block_height); - if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { - return; - } - RD_STATS tokenonly_rd_stats; av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); @@ -304,10 +300,9 @@ static AOM_INLINE int perform_top_color_palette_search( BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors, int start_n, int end_n, int step_size, int *last_n_searched, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, - uint8_t *tx_type_map) { + uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, int *skippable, int *beat_best_rd, + PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) { int centroids[PALETTE_MAX_SIZE]; int n = start_n; int top_color_winner = end_n; @@ -320,8 +315,8 @@ static AOM_INLINE int perform_top_color_palette_search( memcpy(centroids, top_colors, n * sizeof(top_colors[0])); palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, color_cache, n_cache, best_mbmi, best_palette_color_map, - 
best_rd, best_model_rd, rate, rate_tokenonly, distortion, - skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, &beat_best_palette_rd); *last_n_searched = n; if (beat_best_palette_rd) { @@ -345,10 +340,9 @@ static AOM_INLINE int perform_k_means_palette_search( int upper_bound, int start_n, int end_n, int step_size, int *last_n_searched, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, - int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion, - int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, - uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map, - int data_points) { + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, + uint8_t *tx_type_map, uint8_t *color_map, int data_points) { int centroids[PALETTE_MAX_SIZE]; const int max_itr = 50; int n = start_n; @@ -366,8 +360,8 @@ static AOM_INLINE int perform_k_means_palette_search( av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr); palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, distortion, - skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, &beat_best_palette_rd); *last_n_searched = n; if (beat_best_palette_rd) { @@ -434,9 +428,9 @@ static AOM_INLINE void fill_data_and_get_bounds( void av1_rd_pick_palette_intra_sby( const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, - int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion, - int *skippable, int 
*beat_best_rd, PICK_MODE_CONTEXT *ctx, - uint8_t *best_blk_skip, uint8_t *tx_type_map) { + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, + uint8_t *tx_type_map) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); @@ -450,7 +444,7 @@ void av1_rd_pick_palette_intra_sby( int block_width, block_height, rows, cols; av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, &cols); - const SequenceHeader *const seq_params = &cpi->common.seq_params; + const SequenceHeader *const seq_params = cpi->common.seq_params; const int is_hbd = seq_params->use_highbitdepth; const int bit_depth = seq_params->bit_depth; int unused; @@ -532,8 +526,8 @@ void av1_rd_pick_palette_intra_sby( const int top_color_winner = perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, step_size, &unused, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, - distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); // Evaluate neighbors for the winner color (if winner is found) in the // above coarse search for dominant colors if (top_color_winner <= max_n) { @@ -544,18 +538,18 @@ void av1_rd_pick_palette_intra_sby( perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n, stage2_max_n + 1, stage2_step_size, &unused, color_cache, n_cache, - best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map); + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map); } // K-means clustering. 
// Perform k-means coarse palette search to find the winner candidate const int k_means_winner = perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, min_n, max_n + 1, step_size, &unused, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, - distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, - color_map, rows * cols); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols); // Evaluate neighbors for the winner color (if winner is found) in the // above coarse search for k-means if (k_means_winner <= max_n) { @@ -567,9 +561,8 @@ void av1_rd_pick_palette_intra_sby( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, start_n_stage2, end_n_stage2 + 1, step_size_stage2, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, - best_model_rd, rate, rate_tokenonly, distortion, skippable, - beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, - rows * cols); + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols); } } else { const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE), @@ -579,17 +572,16 @@ void av1_rd_pick_palette_intra_sby( perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n, min_n - 1, -1, &last_n_searched, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, - distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); if (last_n_searched > min_n) { // Search in ascending order until we get to the previous best perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, 
last_n_searched, 1, &unused, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); } // K-means clustering. if (colors == PALETTE_MIN_SIZE) { @@ -599,26 +591,25 @@ void av1_rd_pick_palette_intra_sby( centroids[1] = upper_bound; palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors, color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, distortion, - skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, - NULL); + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, NULL); } else { // Perform k-means palette search in descending order last_n_searched = max_n; perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, max_n, min_n - 1, -1, &last_n_searched, color_cache, n_cache, - best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map, color_map, rows * cols); + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, color_map, rows * cols); if (last_n_searched > min_n) { // Search in ascending order until we get to the previous best perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, min_n, last_n_searched, 1, &unused, color_cache, n_cache, - best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map, color_map, rows * cols); + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, 
best_blk_skip, + tx_type_map, color_map, rows * cols); } } } @@ -645,7 +636,7 @@ void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->bsize)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->bsize; - const SequenceHeader *const seq_params = &cpi->common.seq_params; + const SequenceHeader *const seq_params = cpi->common.seq_params; int this_rate; int64_t this_rd; int colors_u, colors_v, colors; @@ -737,7 +728,7 @@ void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x, } av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); optimize_palette_colors(color_cache, n_cache, n, 2, centroids, - cpi->common.seq_params.bit_depth); + cpi->common.seq_params->bit_depth); // Sort the U channel colors in ascending order. for (i = 0; i < 2 * (n - 1); i += 2) { int min_idx = i; @@ -811,7 +802,7 @@ void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; } else { diff --git a/third_party/libaom/source/libaom/av1/encoder/palette.h b/third_party/libaom/source/libaom/av1/encoder/palette.h index 85af473892..7d9a72f61d 100644 --- a/third_party/libaom/source/libaom/av1/encoder/palette.h +++ b/third_party/libaom/source/libaom/av1/encoder/palette.h @@ -185,10 +185,9 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, void av1_rd_pick_palette_intra_sby( const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize, int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, - int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, int *beat_best_rd, - struct PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, - uint8_t 
*tx_type_map); + int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, + int *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx, + uint8_t *best_blk_skip, uint8_t *tx_type_map); /*!\brief Search for the best palette in the chroma plane. * diff --git a/third_party/libaom/source/libaom/av1/encoder/partition_search.c b/third_party/libaom/source/libaom/av1/encoder/partition_search.c index 5d54a80b36..c5bfaf684f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/partition_search.c @@ -25,6 +25,7 @@ #include "av1/encoder/encodemv.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_search.h" +#include "av1/encoder/partition_strategy.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/var_based_part.h" @@ -34,6 +35,48 @@ #include "av1/encoder/tune_vmaf.h" #endif +void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) { + part_sf->partition_search_type = SEARCH_PARTITION; + part_sf->less_rectangular_check_level = 0; + part_sf->use_square_partition_only_threshold = BLOCK_128X128; + part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + part_sf->default_max_partition_size = BLOCK_LARGEST; + part_sf->default_min_partition_size = BLOCK_4X4; + part_sf->adjust_var_based_rd_partitioning = 0; + part_sf->allow_partition_search_skip = 0; + part_sf->max_intra_bsize = BLOCK_LARGEST; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + part_sf->fixed_partition_size = BLOCK_16X16; + // Recode loop tolerance %. 
+ part_sf->partition_search_breakout_dist_thr = 0; + part_sf->partition_search_breakout_rate_thr = 0; + part_sf->prune_ext_partition_types_search_level = 0; + part_sf->prune_part4_search = 0; + part_sf->ml_prune_partition = 0; + part_sf->ml_early_term_after_part_split_level = 0; + for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { + part_sf->ml_partition_search_breakout_thresh[i] = + -1; // -1 means not enabled. + } + part_sf->simple_motion_search_prune_agg = 0; + part_sf->simple_motion_search_split = 0; + part_sf->simple_motion_search_prune_rect = 0; + part_sf->simple_motion_search_early_term_none = 0; + part_sf->simple_motion_search_reduce_search_steps = 0; + part_sf->intra_cnn_split = 0; + part_sf->ext_partition_eval_thresh = BLOCK_8X8; + part_sf->prune_ext_part_using_split_info = 0; + part_sf->prune_rectangular_split_based_on_qidx = 0; + part_sf->early_term_after_none_split = 0; + part_sf->ml_predict_breakout_level = 0; + part_sf->prune_sub_8x8_partition_level = 0; + part_sf->simple_motion_search_rect_split = 0; + part_sf->reuse_prev_rd_results_for_part_ab = 0; + part_sf->reuse_best_prediction_for_part_ab = 0; + part_sf->use_best_rd_for_pruning = 0; +} + static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, FRAME_COUNTS *counts, TX_SIZE tx_size, int depth, int blk_row, int blk_col, @@ -151,11 +194,14 @@ static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 
0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; set_txfm_context(xd, sub_txs, offsetr, offsetc); } } @@ -281,7 +327,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, xd->block_ref_scale_factors[ref], num_planes); } const int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd && - cm->seq_params.bit_depth == AOM_BITS_8) + cm->seq_params->bit_depth == AOM_BITS_8) ? 1 : 0; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, @@ -395,8 +441,8 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, if (!dry_run) { if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && cpi->sf.rt_sf.use_temporal_noise_estimate && - (!cpi->use_svc || - (cpi->use_svc && + (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize); @@ -590,7 +636,7 @@ static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, RD_STATS *rd_cost, PARTITION_TYPE partition, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, RD_STATS best_rd) { - if (best_rd.rdcost < 0) { + if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) { ctx->rd_stats.rdcost = INT64_MAX; ctx->rd_stats.skip_txfm = 0; av1_invalid_rd_stats(rd_cost); @@ -599,7 +645,8 @@ static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); - if (ctx->rd_mode_is_ready) { + if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab && + ctx->rd_mode_is_ready) { assert(ctx->mic.bsize == bsize); assert(ctx->mic.partition == partition); rd_cost->rate = ctx->rd_stats.rate; @@ -672,6 +719,13 @@ static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, av1_set_error_per_bit(&x->errorperbit, x->rdmult); 
av1_rd_cost_update(x->rdmult, &best_rd); + // If set best_rd.rdcost to INT64_MAX, the encoder will not use any previous + // rdcost information for the following mode search. + // Disabling the feature could get some coding gain, with encoder slowdown. + if (!cpi->sf.part_sf.use_best_rd_for_pruning) { + av1_invalid_rd_stats(&best_rd); + } + // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (frame_is_intra_only(cm)) { @@ -750,11 +804,11 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { #if CONFIG_ENTROPY_STATS // delta quant applies to both intra and inter const int super_block_upper_left = - ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag && - (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && super_block_upper_left) { const int dq = (mbmi->current_qindex - xd->current_base_qindex) / delta_q_info->delta_q_res; @@ -798,10 +852,16 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { } if (av1_allow_intrabc(cm)) { - update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2); + const int is_intrabc = is_intrabc_block(mbmi); + update_cdf(fc->intrabc_cdf, is_intrabc, 2); #if CONFIG_ENTROPY_STATS - ++td->counts->intrabc[is_intrabc_block(mbmi)]; + ++td->counts->intrabc[is_intrabc]; #endif // CONFIG_ENTROPY_STATS + if (is_intrabc) { + const int_mv dv_ref = x->mbmi_ext_frame->ref_mv_stack[0].this_mv; + av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc, + MV_SUBPEL_NONE); + } } if (frame_is_intra_only(cm) || mbmi->skip_mode) return; @@ -947,7 +1007,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { 
} } - if (cm->seq_params.enable_interintra_compound && + if (cm->seq_params->enable_interintra_compound && is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; if (mbmi->ref_frame[1] == INTRA_FRAME) { @@ -1008,7 +1068,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { mbmi->motion_mode == SIMPLE_TRANSLATION); const int masked_compound_used = is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int comp_group_idx_ctx = get_comp_group_idx_context(xd); #if CONFIG_ENTROPY_STATS @@ -1053,7 +1113,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { if (inter_block && cm->features.interp_filter == SWITCHABLE && mbmi->motion_mode != WARPED_CAUSAL && !is_nontrans_global_motion(xd, mbmi)) { - update_filter_type_cdf(xd, mbmi, cm->seq_params.enable_dual_filter); + update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter); } if (inter_block && !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { @@ -1160,8 +1220,8 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; - const int subsampling_x = cm->seq_params.subsampling_x; - const int subsampling_y = cm->seq_params.subsampling_y; + const int subsampling_x = cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); const int origin_mult = x->rdmult; @@ -1174,9 +1234,9 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], x->cb_offset[PLANE_TYPE_UV]); assert(x->cb_offset[PLANE_TYPE_Y] < - (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size])); + (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); 
assert(x->cb_offset[PLANE_TYPE_UV] < - ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >> + ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> (subsampling_x + subsampling_y))); } @@ -1184,7 +1244,7 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, if (!dry_run) { update_cb_offsets(x, bsize, subsampling_x, subsampling_y); - if (bsize == cpi->common.seq_params.sb_size && mbmi->skip_txfm == 1 && + if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 && cm->delta_q_info.delta_lf_present_flag) { const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; @@ -1202,11 +1262,11 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, // delta quant applies to both intra and inter const int super_block_upper_left = - ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + ((mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params->mib_size - 1)) == 0); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag && - (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && super_block_upper_left) { xd->current_base_qindex = mbmi->current_qindex; if (delta_q_info->delta_lf_present_flag) { @@ -1753,11 +1813,11 @@ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. 
- if (bsize == cm->seq_params.sb_size) + if (bsize == cm->seq_params->sb_size) assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); if (do_recon) { - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, @@ -1792,15 +1852,15 @@ static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data, // Nonrd pickmode does not currently support second/combined reference. assert(!has_second_ref(mbmi)); av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); - const int subsampling_x = cpi->common.seq_params.subsampling_x; - const int subsampling_y = cpi->common.seq_params.subsampling_y; + const int subsampling_x = cpi->common.seq_params->subsampling_x; + const int subsampling_y = cpi->common.seq_params->subsampling_y; if (!dry_run) { set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], x->cb_offset[PLANE_TYPE_UV]); assert(x->cb_offset[PLANE_TYPE_Y] < - (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size])); + (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); assert(x->cb_offset[PLANE_TYPE_UV] < - ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >> + ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> (subsampling_x + subsampling_y))); } encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); @@ -1808,6 +1868,8 @@ static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data, update_cb_offsets(x, bsize, subsampling_x, subsampling_y); if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); } + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm) + av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize); // TODO(Ravi/Remya): Move this copy function to a better logical place // This function will copy the best mode information from block // level 
(x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This @@ -1889,8 +1951,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, int i; wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync, - &tile_data->tile_info, cm->seq_params.sb_size, - cm->seq_params.mib_size_log2, bsize, mi_row, mi_col); + &tile_data->tile_info, cm->seq_params->sb_size, + cm->seq_params->mib_size_log2, bsize, mi_row, mi_col); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, rd_pick_sb_modes_time); @@ -1947,6 +2009,30 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, end_timing(cpi, av1_rd_pick_inter_mode_sb_time); #endif } + if (cpi->sf.rt_sf.skip_cdef_sb) { + // Find the corresponding 64x64 block. It'll be the 128x128 block if that's + // the block size. + const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64; + const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64; + MB_MODE_INFO **mi_sb = + cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb); + // Do not skip if intra or new mv is picked. + const int skip = mi_sb[0]->skip_cdef_curr_sb && + !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV); + // If 128x128 block is used, we need to set the flag for all 4 64x64 sub + // "blocks". + const int block64_in_sb = (bsize == BLOCK_128X128) ? 2 : 1; + for (int r = 0; r < block64_in_sb; ++r) { + for (int c = 0; c < block64_in_sb; ++c) { + const int idx_in_sb = + r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64; + if (mi_sb[idx_in_sb]) mi_sb[idx_in_sb]->skip_cdef_curr_sb = skip; + } + } + // Store in the pickmode context. + ctx->mic.skip_cdef_curr_sb = mi_sb[0]->skip_cdef_curr_sb; + } x->rdmult = orig_rdmult; ctx->rd_stats.rate = rd_cost->rate; ctx->rd_stats.dist = rd_cost->dist; @@ -2301,15 +2387,15 @@ static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, // Loop over sub-partitions in AB partition type. 
for (int i = 0; i < SUB_PARTITIONS_AB; i++) { if (mode_cache && mode_cache[i]) { - x->use_intermode_cache = 1; - x->intermode_cache = mode_cache[i]; + x->use_mb_mode_cache = 1; + x->mb_mode_cache = mode_cache[i]; } const int mode_search_success = rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1, ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i], *best_rdc, &sum_rdc, partition, ctxs[i]); - x->use_intermode_cache = 0; - x->intermode_cache = NULL; + x->use_mb_mode_cache = 0; + x->mb_mode_cache = NULL; if (!mode_search_success) { return false; } @@ -2629,7 +2715,8 @@ static void rectangular_partition_search( TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PartitionSearchState *part_search_state, RD_STATS *best_rdc, - RD_RECT_PART_WIN_INFO *rect_part_win_info) { + RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type, + const RECT_PART_TYPE end_type) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; RD_STATS *sum_rdc = &part_search_state->sum_rdc; @@ -2663,7 +2750,7 @@ static void rectangular_partition_search( }; // Loop over rectangular partition types. 
- for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) { + for (RECT_PART_TYPE i = start_type; i <= end_type; i++) { assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, !part_search_state->partition_rect_allowed[i])); @@ -2879,7 +2966,8 @@ static void ab_partitions_search( TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PC_TREE *pc_tree, PartitionSearchState *part_search_state, RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info, - int pb_source_variance, int ext_partition_allowed) { + int pb_source_variance, int ext_partition_allowed, + const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) { PartitionBlkParams blk_params = part_search_state->part_blk_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; @@ -2888,9 +2976,9 @@ static void ab_partitions_search( int ab_partitions_allowed[NUM_AB_PARTS] = { 1, 1, 1, 1 }; // Prune AB partitions av1_prune_ab_partitions( - cpi, x, pc_tree, bsize, pb_source_variance, best_rdc->rdcost, - part_search_state->rect_part_rd, part_search_state->split_rd, - rect_part_win_info, ext_partition_allowed, + cpi, x, pc_tree, bsize, mi_row, mi_col, pb_source_variance, + best_rdc->rdcost, part_search_state->rect_part_rd, + part_search_state->split_rd, rect_part_win_info, ext_partition_allowed, part_search_state->partition_rect_allowed[HORZ], part_search_state->partition_rect_allowed[VERT], &ab_partitions_allowed[HORZ_A], &ab_partitions_allowed[HORZ_B], @@ -2946,7 +3034,7 @@ static void ab_partitions_search( }; // Loop over AB partition types. 
- for (AB_PART_TYPE ab_part_type = 0; ab_part_type < NUM_AB_PARTS; + for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type; ab_part_type++) { const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A; @@ -2956,33 +3044,35 @@ static void ab_partitions_search( continue; blk_params.subsize = get_partition_subsize(bsize, part_type); - for (int i = 0; i < SUB_PARTITIONS_AB; i++) { - // Set AB partition context. - cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc( - cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf); - // Set mode as not ready. - cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0; - } + if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) { + for (int i = 0; i < SUB_PARTITIONS_AB; i++) { + // Set AB partition context. + cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc( + cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf); + // Set mode as not ready. + cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0; + } - // We can copy directly the mode search results if we have already searched - // the current block and the contexts match. - if (is_ctx_ready[ab_part_type][0]) { - av1_copy_tree_context(cur_part_ctxs[ab_part_type][0], - mode_srch_ctx[ab_part_type][0][0]); - cur_part_ctxs[ab_part_type][0]->mic.partition = part_type; - cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1; - if (is_ctx_ready[ab_part_type][1]) { - av1_copy_tree_context(cur_part_ctxs[ab_part_type][1], - mode_srch_ctx[ab_part_type][1][0]); - cur_part_ctxs[ab_part_type][1]->mic.partition = part_type; - cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1; + // We can copy directly the mode search results if we have already + // searched the current block and the contexts match. 
+ if (is_ctx_ready[ab_part_type][0]) { + av1_copy_tree_context(cur_part_ctxs[ab_part_type][0], + mode_srch_ctx[ab_part_type][0][0]); + cur_part_ctxs[ab_part_type][0]->mic.partition = part_type; + cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1; + if (is_ctx_ready[ab_part_type][1]) { + av1_copy_tree_context(cur_part_ctxs[ab_part_type][1], + mode_srch_ctx[ab_part_type][1][0]); + cur_part_ctxs[ab_part_type][1]->mic.partition = part_type; + cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1; + } } } // Even if the contexts don't match, we can still speed up by reusing the // previous prediction mode. const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL }; - if (cpi->sf.inter_sf.reuse_best_prediction_for_part_ab) { + if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) { set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type); } @@ -3180,21 +3270,6 @@ static void prune_4_way_partition_search( part4_search_allowed); } -// Set PARTITION_NONE allowed flag. -static AOM_INLINE void set_part_none_allowed_flag( - AV1_COMP *const cpi, PartitionSearchState *part_search_state) { - PartitionBlkParams blk_params = part_search_state->part_blk_params; - if ((blk_params.width <= blk_params.min_partition_size_1d) && - blk_params.has_rows && blk_params.has_cols) - part_search_state->partition_none_allowed = 1; - assert(part_search_state->terminate_partition_search == 0); - - // Set PARTITION_NONE for screen content. - if (cpi->use_screen_content_tools) - part_search_state->partition_none_allowed = - blk_params.has_rows && blk_params.has_cols; -} - // Set params needed for PARTITION_NONE search. 
static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td, MACROBLOCK *x, PC_TREE *pc_tree, @@ -3247,11 +3322,10 @@ static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x, bsize <= cpi->sf.part_sf.use_square_partition_only_threshold && bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1; if (use_ml_based_breakout) { - if (av1_ml_predict_breakout(cpi, bsize, x, this_rdc, *pb_source_variance, - xd->bd)) { - part_search_state->do_square_split = 0; - part_search_state->do_rectangular_split = 0; - } + av1_ml_predict_breakout(cpi, bsize, x, this_rdc, blk_params, + *pb_source_variance, xd->bd, + &part_search_state->do_square_split, + &part_search_state->do_rectangular_split); } // Adjust dist breakout threshold according to the partition size. @@ -3329,10 +3403,11 @@ static void prune_partitions_after_split( !part_search_state->terminate_partition_search) { av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm), bsize); - av1_ml_prune_rect_partition( - cpi, x, bsize, best_rdc->rdcost, part_search_state->none_rd, - part_search_state->split_rd, &part_search_state->prune_rect_part[HORZ], - &part_search_state->prune_rect_part[VERT]); + av1_ml_prune_rect_partition(cpi, x, bsize, mi_row, mi_col, best_rdc->rdcost, + part_search_state->none_rd, + part_search_state->split_rd, + &part_search_state->prune_rect_part[HORZ], + &part_search_state->prune_rect_part[VERT]); } } @@ -3351,12 +3426,11 @@ static void none_partition_search( const BLOCK_SIZE bsize = blk_params.bsize; assert(bsize < BLOCK_SIZES_ALL); - // Set PARTITION_NONE allowed flag. - set_part_none_allowed_flag(cpi, part_search_state); if (!part_search_state->partition_none_allowed) return; int pt_cost = 0; RD_STATS best_remain_rdcost; + av1_invalid_rd_stats(&best_remain_rdcost); // Set PARTITION_NONE context and cost. 
set_none_partition_params(cpi, td, x, pc_tree, part_search_state, @@ -3402,7 +3476,7 @@ static void none_partition_search( if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) { const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame); av1_update_picked_ref_frames_mask( - x, ref_type, bsize, cm->seq_params.mib_size, mi_row, mi_col); + x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col); } // Calculate the total cost and update the best partition. @@ -3553,6 +3627,376 @@ static void split_partition_search( av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); } +// The max number of nodes in the partition tree. +// The number of leaf nodes is (128x128) / (4x4) = 1024. +// The number of All possible parent nodes is 1 + 2 + ... + 512 = 1023. +#define NUM_NODES 2048 + +static void write_partition_tree(AV1_COMP *const cpi, + const PC_TREE *const pc_tree, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col) { + (void)mi_row; + (void)mi_col; + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, + cpi->sb_counter, 0); + ++cpi->sb_counter; + FILE *pfile = fopen(filename, "w"); + fprintf(pfile, "%d", bsize); + + // Write partition type with BFS order. + const PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int q_idx = 0; + int depth = 0; + int last_idx = 1; + int num_nodes = 1; + + // First traversal to get number of leaf nodes and depth. + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + const int num_leafs = last_idx; + fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); + + // Write partitions for each node. 
+ q_idx = 0; + depth = 0; + last_idx = 1; + num_nodes = 1; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + fprintf(pfile, ",%d", node->partitioning); + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + fprintf(pfile, "\n"); + + fclose(pfile); +} + +static void verify_write_partition_tree(const AV1_COMP *const cpi, + const PC_TREE *const pc_tree, + const BLOCK_SIZE bsize, + const int config_id, const int mi_row, + const int mi_col) { + (void)mi_row; + (void)mi_col; + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d", + path, cpi->sb_counter, config_id); + FILE *pfile = fopen(filename, "w"); + fprintf(pfile, "%d", bsize); + + // Write partition type with BFS order. + const PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int q_idx = 0; + int depth = 0; + int last_idx = 1; + int num_nodes = 1; + + // First traversal to get number of leaf nodes and depth. + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL && node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + const int num_leafs = last_idx; + fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); + + // Write partitions for each node. 
+ q_idx = 0; + depth = 0; + last_idx = 1; + num_nodes = 1; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { // suppress warning + fprintf(pfile, ",%d", node->partitioning); + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + } + --num_nodes; + ++q_idx; + } + fprintf(pfile, "\n"); + + fclose(pfile); +} + +static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree, + const int config_id) { + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, + cpi->sb_counter, config_id); + FILE *pfile = fopen(filename, "r"); + if (pfile == NULL) { + printf("Can't find the file: %s\n", filename); + exit(0); + } + + int read_bsize; + int num_nodes; + int num_configs; + fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs); + assert(read_bsize == cpi->common.seq_params->sb_size); + BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize; + + PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int last_idx = 1; + int q_idx = 0; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + int partitioning; + fscanf(pfile, ",%d", &partitioning); + assert(partitioning >= PARTITION_NONE && + partitioning < EXT_PARTITION_TYPES); + PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) node->partitioning = partitioning; + if (partitioning == PARTITION_SPLIT) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + if (node != NULL) { // Suppress warning + node->split[i] = av1_alloc_pc_tree_node(subsize); + node->split[i]->index = i; + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + } + bsize = subsize; + } + --num_nodes; + ++q_idx; + } + fclose(pfile); + + return num_configs; +} + +static RD_STATS 
rd_search_for_fixed_partition( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, + const BLOCK_SIZE bsize, PC_TREE *pc_tree) { + const PARTITION_TYPE partition = pc_tree->partitioning; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + TileInfo *const tile_info = &tile_data->tile_info; + RD_STATS best_rdc; + av1_invalid_rd_stats(&best_rdc); + int sum_subblock_rate = 0; + int64_t sum_subblock_dist = 0; + PartitionSearchState part_search_state; + init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, + bsize); + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c). + PartitionBlkParams blk_params = part_search_state.part_blk_params; + if (!(blk_params.has_rows && blk_params.has_cols)) + set_partition_cost_for_edge_blk(cm, &part_search_state); + + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + (void)orig_rdmult; + + // Set the context. 
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + assert(bsize < BLOCK_SIZES_ALL); + unsigned int pb_source_variance = UINT_MAX; + int64_t part_none_rd = INT64_MAX; + int64_t none_rd = INT64_MAX; + int inc_step[NUM_PART4_TYPES] = { 0 }; + if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4; + if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4; + + switch (partition) { + case PARTITION_NONE: + none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx, + &part_search_state, &best_rdc, &pb_source_variance, + &none_rd, &part_none_rd); + break; + case PARTITION_HORZ: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, HORZ, + HORZ); + break; + case PARTITION_VERT: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, VERT, + VERT); + break; + case PARTITION_HORZ_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_A, HORZ_A); + break; + case PARTITION_HORZ_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_B, HORZ_B); + break; + case PARTITION_VERT_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_A, VERT_A); + break; + case PARTITION_VERT_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_B, VERT_B); + break; + case PARTITION_HORZ_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->horizontal4, 
&part_search_state, &best_rdc, + inc_step, PARTITION_HORZ_4); + break; + case PARTITION_VERT_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->vertical4, &part_search_state, &best_rdc, + inc_step, PARTITION_VERT_4); + break; + case PARTITION_SPLIT: + for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) { + const BLOCK_SIZE subsize = + get_partition_subsize(bsize, PARTITION_SPLIT); + assert(subsize < BLOCK_SIZES_ALL); + const int next_mi_row = + idx < 2 ? mi_row : mi_row + mi_size_high[subsize]; + const int next_mi_col = + idx % 2 == 0 ? mi_col : mi_col + mi_size_wide[subsize]; + if (next_mi_row >= cm->mi_params.mi_rows || + next_mi_col >= cm->mi_params.mi_cols) { + continue; + } + const RD_STATS subblock_rdc = rd_search_for_fixed_partition( + cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row, + next_mi_col, subsize, pc_tree->split[idx]); + sum_subblock_rate += subblock_rdc.rate; + sum_subblock_dist += subblock_rdc.dist; + } + best_rdc.rate = sum_subblock_rate; + best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT]; + best_rdc.dist = sum_subblock_dist; + best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist); + break; + default: assert(0 && "invalid partition type."); exit(0); + } + // Note: it is necessary to restore context information. 
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (bsize != cm->seq_params->sb_size) { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + x->rdmult = orig_rdmult; + + return best_rdc; +} + +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + RD_STATS *best_rd_cost) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + int best_idx = 0; + int64_t min_rdcost = INT64_MAX; + int num_configs; + RD_STATS *rdcost = NULL; + int i = 0; + do { + PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize); + num_configs = read_partition_tree(cpi, pc_tree, i); + if (i == 0) { + rdcost = aom_calloc(num_configs, sizeof(*rdcost)); + } + if (num_configs <= 0) { + av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0); + if (rdcost != NULL) aom_free(rdcost); + exit(0); + return false; + } + verify_write_partition_tree(cpi, pc_tree, bsize, i, mi_row, mi_col); + // Encode the block with the given partition tree. Get rdcost and encoding + // time. + rdcost[i] = rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, + mi_row, mi_col, bsize, pc_tree); + + if (rdcost[i].rdcost < min_rdcost) { + min_rdcost = rdcost[i].rdcost; + best_idx = i; + *best_rd_cost = rdcost[i]; + } + av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0); + ++i; + } while (i < num_configs); + + // Encode with the partition configuration with the smallest rdcost. 
+ PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize); + read_partition_tree(cpi, pc_tree, best_idx); + rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row, + mi_col, bsize, pc_tree); + set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + + av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0); + aom_free(rdcost); + ++cpi->sb_counter; + + return true; +} + /*!\brief AV1 block partition search (full search). * * \ingroup partition_search @@ -3617,7 +4061,7 @@ bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, av1_invalid_rd_stats(rd_cost); return part_search_state.found_best_partition; } - if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; + if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0; // Override skipping rectangular partition operations for edge blocks. if (none_rd) *none_rd = 0; @@ -3742,7 +4186,7 @@ BEGIN_PARTITION_SEARCH: // when NONE and SPLIT partition rd_costs are INT64_MAX. if (cpi->sf.part_sf.early_term_after_none_split && part_none_rd == INT64_MAX && part_split_rd == INT64_MAX && - !x->must_find_valid_partition && (bsize != cm->seq_params.sb_size)) { + !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) { part_search_state.terminate_partition_search = 1; } @@ -3755,7 +4199,7 @@ BEGIN_PARTITION_SEARCH: // Rectangular partitions search stage. rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, &part_search_state, &best_rdc, - rect_part_win_info); + rect_part_win_info, HORZ, VERT); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, rectangular_partition_search_time); #endif @@ -3784,7 +4228,8 @@ BEGIN_PARTITION_SEARCH: // AB partitions search stage. 
ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, &part_search_state, &best_rdc, rect_part_win_info, - pb_source_variance, ext_partition_allowed); + pb_source_variance, ext_partition_allowed, HORZ_A, + VERT_B); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, ab_partitions_search_time); #endif @@ -3832,7 +4277,7 @@ BEGIN_PARTITION_SEARCH: end_timing(cpi, rd_pick_4partition_time); #endif - if (bsize == cm->seq_params.sb_size && + if (bsize == cm->seq_params->sb_size && !part_search_state.found_best_partition) { // Did not find a valid partition, go back and search again, with less // constraint on which partition types to search. @@ -3859,7 +4304,7 @@ BEGIN_PARTITION_SEARCH: // prediction block. print_partition_timing_stats_with_rdcost( part_timing_stats, mi_row, mi_col, bsize, - cpi->gf_group.update_type[cpi->gf_group.index], + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], cm->current_frame.frame_number, &best_rdc, "part_timing.csv"); /* print_partition_timing_stats(part_timing_stats, cm->show_frame, @@ -3881,11 +4326,14 @@ BEGIN_PARTITION_SEARCH: // If a valid partition is found and reconstruction is required for future // sub-blocks in the same group. if (part_search_state.found_best_partition && pc_tree->index != 3) { - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { // Encode the superblock. const int emit_output = multi_pass_mode != SB_DRY_PASS; const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL; + // Write partition tree to file. Not used by default. 
+ if (0) write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col); + set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize, pc_tree, NULL); @@ -3907,7 +4355,7 @@ BEGIN_PARTITION_SEARCH: if (pc_tree_dealloc == 0) av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1); - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); } else { @@ -3958,7 +4406,7 @@ static int ml_predict_var_paritioning(AV1_COMP *cpi, MACROBLOCK *x, const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; float features[FEATURES] = { 0.0f }; const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); int feature_idx = 0; float score[LABELS]; @@ -4038,7 +4486,7 @@ static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, { const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); int feature_idx = 0; features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); @@ -4186,7 +4634,7 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, int partition_none_allowed = !force_horz_split && !force_vert_split; assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only - assert(cm->seq_params.sb_size == BLOCK_64X64); // Small SB so far + assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far (void)*tp_orig; @@ -4293,7 +4741,7 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree); if (do_recon) { - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, diff --git 
a/third_party/libaom/source/libaom/av1/encoder/partition_search.h b/third_party/libaom/source/libaom/av1/encoder/partition_search.h index 136548e3e6..8a6717690c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_search.h +++ b/third_party/libaom/source/libaom/av1/encoder/partition_search.h @@ -39,6 +39,13 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, RD_STATS *rd_cost, int do_recon, int64_t best_rd, PC_TREE *pc_tree); #endif +void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf); + +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, BLOCK_SIZE bsize, + RD_STATS *best_rd_cost); bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, @@ -57,12 +64,14 @@ static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset, static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize, const int subsampling_x, const int subsampling_y) { - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, subsampling_x, subsampling_y); x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize]; - if (x->e_mbd.is_chroma_ref) + if (x->e_mbd.is_chroma_ref) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize != BLOCK_INVALID); x->cb_offset[PLANE_TYPE_UV] += block_size_wide[plane_bsize] * block_size_high[plane_bsize]; + } } #endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c index f846d595bc..bf678a452f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c +++ b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c @@ -35,6 +35,48 @@ static AOM_INLINE 
void simple_motion_search_prune_part_features( int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get); +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split); + +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert); + +static bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split); + +static bool ext_ml_model_decision_after_none_part2( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split_part2( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_prune, int *prune_rect_part_horz, + int *prune_rect_part_vert); + +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed); + +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col); + static INLINE 
int convert_bsize_to_idx(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_128X128: return 0; @@ -45,9 +87,45 @@ static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) { default: assert(0 && "Invalid bsize"); return -1; } } -#endif -#if !CONFIG_REALTIME_ONLY +static char *get_feature_file_name(int id) { + static char *feature_file_names[] = { + "feature_before_partition_none", + "feature_before_partition_none_prune_rect", + "feature_after_partition_none_prune", + "feature_after_partition_none_terminate", + "feature_after_partition_split_terminate", + "feature_after_partition_split_prune_rect", + "feature_after_partition_rect", + "feature_after_partition_ab", + }; + + return feature_file_names[id]; +} + +static void write_features_to_file(const char *const path, + const bool is_test_mode, + const float *features, + const int feature_size, const int id, + const int bsize, const int mi_row, + const int mi_col) { + if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return; + + char filename[256]; + snprintf(filename, sizeof(filename), "%s/%s", path, + get_feature_file_name(id)); + FILE *pfile = fopen(filename, "a"); + if (!is_test_mode) { + fprintf(pfile, "%d,%d,%d,%d,%d\n", id, bsize, mi_row, mi_col, feature_size); + } + for (int i = 0; i < feature_size; ++i) { + fprintf(pfile, "%.6f", features[i]); + if (i < feature_size - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + // TODO(chiyotsai@google.com): This is very much a work in progress. 
We still // need to the following: // -- add support for hdres @@ -61,7 +139,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, int *partition_vert_allowed, int *do_rectangular_split, int *do_square_split) { - assert(cm->seq_params.sb_size >= BLOCK_64X64 && + assert(cm->seq_params->sb_size >= BLOCK_64X64 && "Invalid sb_size for intra_cnn!"); const int bsize_idx = convert_bsize_to_idx(bsize); @@ -284,6 +362,20 @@ void av1_simple_motion_search_based_split( simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, features, FEATURE_SMS_SPLIT_MODEL_FLAG); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col); + + // Note: it is intended to not normalize the features here, to keep it + // consistent for all features collected and passed to the external model. + if (ext_ml_model_decision_before_none( + cpi, features, partition_none_allowed, partition_horz_allowed, + partition_vert_allowed, do_rectangular_split, do_square_split)) { + return; + } + for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) { features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx]; } @@ -308,7 +400,7 @@ void av1_simple_motion_search_based_split( // If the score is very low, prune rectangular split since it is unlikely to // occur. if (cpi->sf.part_sf.simple_motion_search_rect_split) { - const float scale = res_idx >= 2 ? 3 : 2; + const float scale = res_idx >= 2 ? 
3.0f : 2.0f; const float rect_split_thresh = scale * av1_simple_motion_search_no_split_thresh [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx] @@ -356,7 +448,7 @@ static int simple_motion_search_get_best_ref( int_mv best_mv = av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes, use_subpixel); - curr_var = cpi->fn_ptr[bsize].vf( + curr_var = cpi->ppi->fn_ptr[bsize].vf( x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, &curr_sse); if (curr_sse < *best_sse) { @@ -543,6 +635,24 @@ void av1_simple_motion_search_prune_rect( simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, features, FEATURE_SMS_PRUNE_PART_FLAG); + + // Note: it is intended to not normalize the features here, to keep it + // consistent for all features collected and passed to the external model. + if (cpi->sf.part_sf.simple_motion_search_prune_rect && + !frame_is_intra_only(cm) && + (partition_horz_allowed || partition_vert_allowed) && + bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) { + // Write features to file + write_features_to_file( + cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, + features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_before_none_part2(cpi, features, prune_horz, + prune_vert)) { + return; + } + } + for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) { features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; } @@ -617,6 +727,15 @@ void av1_simple_motion_search_early_term_none( assert(0 && "Unexpected block size in simple_motion_term_none"); } + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_none_part2(cpi, features, early_terminate)) { + return; + } + if (ml_model) { float score = 0.0f; for (f_idx = 0; 
f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { @@ -636,8 +755,9 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, float *features) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + // Currently this only allows 128X128 SB size. May extend it to 64X64 SB size. assert(sb_size == BLOCK_128X128); int f_idx = 0; @@ -701,14 +821,18 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, if (log_sse > max_log_sse) max_log_sse = log_sse; } aom_clear_system_state(); - const float avg_mv_row = sum_mv_row / 64.0f; - const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row; + const int blks = mb_rows * mb_cols; + const float avg_mv_row = sum_mv_row / (float)blks; + const float var_mv_row = + sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row; - const float avg_mv_col = sum_mv_col / 64.0f; - const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col; + const float avg_mv_col = sum_mv_col / (float)blks; + const float var_mv_col = + sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col; - const float avg_log_sse = sum_log_sse / 64.0f; - const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse; + const float avg_log_sse = sum_log_sse / (float)blks; + const float var_log_sse = + sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse; features[f_idx++] = avg_log_sse; features[f_idx++] = avg_mv_col; @@ -727,11 +851,20 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); } +// Convert result index to block size. 
+// result idx block size +// 0 BLOCK_16X16 +// 1 BLOCK_32X32 +// 2 BLOCK_64X64 +// 3 BLOCK_128X128 +static BLOCK_SIZE get_block_size(int idx) { + return (BLOCK_SIZE)((idx + 2) * 3); +} + BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, const MACROBLOCK *const x, const float *features) { - float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }, - probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != @@ -739,21 +872,26 @@ BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, aom_clear_system_state(); av1_nn_predict(features, nn_config, 1, scores); - av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == DIRECT_PRED) { result = 0; - float max_prob = probs[0]; + float max_score = scores[0]; for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { - if (probs[i] > max_prob) { - max_prob = probs[i]; + if (scores[i] > max_score) { + max_score = scores[i]; result = i; } } - } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == - RELAXED_PRED) { + return get_block_size(result); + } + + float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); + + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + RELAXED_PRED) { for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; --result) { if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { @@ -763,7 +901,7 @@ BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, } } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == ADAPT_PRED) { - const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + const BLOCK_SIZE sb_size = 
cpi->common.seq_params->sb_size; const MACROBLOCKD *const xd = &x->e_mbd; // TODO(debargha): x->source_variance is unavailable at this point, // so compute. The redundant recomputation later can be removed. @@ -784,7 +922,7 @@ BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, } } - return (BLOCK_SIZE)((result + 2) * 3); + return get_block_size(result); } // Get the minimum partition block width and height(in log scale) under a @@ -911,6 +1049,16 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, assert(f_idx == FEATURES); + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 4, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_split(cpi, features, + terminate_partition_search)) { + return; + } + float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); // Score is indicator of confidence that we should NOT terminate. @@ -918,10 +1066,11 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, } #undef FEATURES -void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, - const MACROBLOCK *const x, BLOCK_SIZE bsize, - int64_t best_rd, int64_t none_rd, - int64_t *split_rd, int *const dst_prune_horz, +void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int64_t best_rd, + int64_t none_rd, int64_t *split_rd, + int *const dst_prune_horz, int *const dst_prune_vert) { if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; best_rd = AOMMAX(best_rd, 1); @@ -998,6 +1147,17 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + /*feature_size=*/9, 5, bsize, 
mi_row, mi_col); + + if (ext_ml_model_decision_after_split_part2( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, dst_prune_horz, dst_prune_vert)) { + return; + } + // 2. Do the prediction and prune 0-2 partitions based on their probabilities float raw_scores[3] = { 0.0f }; av1_nn_predict(features, nn_config, 1, raw_scores); @@ -1014,7 +1174,8 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be // considered. void av1_ml_prune_ab_partition( - BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd, + AV1_COMP *const cpi, BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int part_ctx, int var_ctx, int64_t best_rd, int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed, int *const horzb_partition_allowed, int *const verta_partition_allowed, @@ -1065,6 +1226,20 @@ void av1_ml_prune_ab_partition( } assert(feature_index == 10); + // Write features to file + if (!frame_is_intra_only(&cpi->common)) { + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + /*feature_size=*/10, 6, bsize, mi_row, mi_col); + } + + if (ext_ml_model_decision_after_rect( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, horza_partition_allowed, horzb_partition_allowed, + verta_partition_allowed, vertb_partition_allowed)) { + return; + } + // Calculate scores using the NN model. float score[16] = { 0.0f }; av1_nn_predict(features, nn_config, 1, score); @@ -1101,12 +1276,17 @@ void av1_ml_prune_ab_partition( #define LABELS 4 // Use a ML model to predict if horz4 and vert4 should be considered. 
void av1_ml_prune_4_partition( - const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, - int part_ctx, int64_t best_rd, - int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, int *const partition_vert4_allowed, unsigned int pb_source_variance, int mi_row, int mi_col) { + if (ext_ml_model_decision_after_part_ab( + cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd, + partition_horz4_allowed, partition_vert4_allowed, pb_source_variance, + mi_row, mi_col)) + return; + if (best_rd >= 1000000000) return; int64_t *horz_rd = rect_part_rd[HORZ]; int64_t *vert_rd = rect_part_rd[VERT]; @@ -1206,6 +1386,13 @@ void av1_ml_prune_4_partition( } assert(feature_index == FEATURES); + // Write features to file + if (!frame_is_intra_only(&cpi->common)) { + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURES, 7, bsize, mi_row, mi_col); + } + // Calculate scores using the NN model. 
float score[LABELS] = { 0.0f }; av1_nn_predict(features, nn_config, 1, score); @@ -1238,10 +1425,12 @@ void av1_ml_prune_4_partition( #undef LABELS #define FEATURES 4 -int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - const MACROBLOCK *const x, - const RD_STATS *const rd_stats, - unsigned int pb_source_variance, int bit_depth) { +void av1_ml_predict_breakout(AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + const PartitionBlkParams blk_params, + unsigned int pb_source_variance, int bit_depth, + int *do_square_split, int *do_rectangular_split) { const NN_CONFIG *nn_config = NULL; int thresh = 0; switch (bsize) { @@ -1267,7 +1456,7 @@ int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, break; default: assert(0 && "Unexpected bsize."); } - if (!nn_config || thresh < 0) return 0; + if (!nn_config || thresh < 0) return; const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f }; thresh = (int)((float)thresh * @@ -1295,13 +1484,28 @@ int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; assert(feature_index == FEATURES); + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 2, blk_params.bsize, blk_params.mi_row, + blk_params.mi_col); + + if (ext_ml_model_decision_after_none( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, do_square_split, do_rectangular_split)) { + return; + } + // Calculate score using the NN model. float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); aom_clear_system_state(); // Make decision. 
- return (int)(score * 100) >= thresh; + if ((int)(score * 100) >= thresh) { + *do_square_split = 0; + *do_rectangular_split = 0; + } } #undef FEATURES @@ -1361,7 +1565,7 @@ void av1_prune_partitions_before_search( const int try_intra_cnn_split = !cpi->use_screen_content_tools && frame_is_intra_only(cm) && cpi->sf.part_sf.intra_cnn_split && - cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 && + cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 && bsize >= BLOCK_8X8 && mi_row + mi_size_high[bsize] <= mi_params->mi_rows && mi_col + mi_size_wide[bsize] <= mi_params->mi_cols; @@ -1483,8 +1687,9 @@ int evaluate_ab_partition_based_on_split( } void av1_prune_ab_partitions( - const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, - BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost, + AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, + BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int pb_source_variance, int64_t best_rdcost, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed, @@ -1580,7 +1785,7 @@ void av1_prune_ab_partitions( // TODO(huisu@google.com): x->source_variance may not be the current // block's variance. The correct one to use is pb_source_variance. Need to // re-train the model to fix it. - av1_ml_prune_ab_partition(bsize, pc_tree->partitioning, + av1_ml_prune_ab_partition(cpi, bsize, mi_row, mi_col, pc_tree->partitioning, get_unsigned_bits(x->source_variance), best_rdcost, horz_rd, vert_rd, split_rd, horza_partition_allowed, horzb_partition_allowed, @@ -1617,4 +1822,390 @@ void av1_prune_ab_partitions( } } +// Prepare features for the external model. Specifically, features after +// ab partition is searched. 
+static void prepare_features_after_part_ab( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + int part_ctx, int64_t best_rd, + int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance, + int mi_row, int mi_col, aom_partition_features_t *const features) { + int64_t *horz_rd = rect_part_rd[HORZ]; + int64_t *vert_rd = rect_part_rd[VERT]; + + aom_clear_system_state(); + + // Generate features. + int feature_index = 0; + features->after_part_ab.f[feature_index++] = (float)part_ctx; + features->after_part_ab.f[feature_index++] = + (float)get_unsigned_bits(pb_source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features->after_part_ab.f[feature_index++] = rd_ratio; + } + + // Get variance of the 1:4 and 4:1 sub-blocks. 
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common), bsize); + const int src_stride = x->plane[0].src.stride; + uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + + struct buf_2d horz_4_src, vert_4_src; + horz_4_src.stride = src_stride; + vert_4_src.stride = src_stride; + + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; + vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; + + if (is_cur_buf_hbd(xd)) { + horz_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &horz_4_src, horz_4_bs, xd->bd); + vert_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &vert_4_src, vert_4_bs, xd->bd); + } else { + horz_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs); + vert_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs); + } + } + } + + const float denom = (float)(pb_source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features->after_part_ab.f[feature_index++] = var_ratio; + } + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. 
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features->after_part_ab.f[feature_index++] = var_ratio; + } + assert(feature_index == 18); +} + +// If the external partition model is used, we let it determine partition +// decisions before partition none. Specifically, these parameters: +// partition_none_allowed +// partition_horz_allowed +// partition_vert_allowed +// do_rectangular_split +// do_square_split +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_BEFORE_PART_NONE; + for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) { + features.before_part_none.f[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_none_allowed = decision.partition_none_allowed; + *partition_horz_allowed = decision.partition_rect_allowed[HORZ]; + *partition_vert_allowed = decision.partition_rect_allowed[VERT]; + *do_rectangular_split = decision.do_rectangular_split; + *do_square_split = decision.do_square_split; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions before partition none. 
Specifically, these parameters: +// prune_horz +// prune_vert +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_BEFORE_PART_NONE_PART2; + for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) { + features.before_part_none.f_part2[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_horz = decision.prune_rect_part[HORZ]; + *prune_vert = decision.prune_rect_part[VERT]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// do_square_split +// do_rectangular_split +bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split) { + if (!ext_part_controller->ready || is_intra_frame) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_NONE; + for (int i = 0; i < 4; ++i) { + features.after_part_none.f[i] = features_after_none[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *do_square_split = decision.do_square_split; + *do_rectangular_split = decision.do_rectangular_split; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// terminate_partition_search +bool ext_ml_model_decision_after_none_part2( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search) { + AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_NONE_PART2; + for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) { + features.after_part_none.f_terminate[i] = features_terminate[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *terminate_partition_search = decision.terminate_partition_search; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. 
Specifically, these parameters: +// terminate_partition_search +bool ext_ml_model_decision_after_split(AV1_COMP *const cpi, + const float *const features_terminate, + int *terminate_partition_search) { + const AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) { + return false; + } + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_SPLIT; + for (int i = 0; i < 31; ++i) { + features.after_part_split.f_terminate[i] = features_terminate[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *terminate_partition_search = decision.terminate_partition_search; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// prune_rect_part[HORZ] +// prune_rect_part[VERT] +bool ext_ml_model_decision_after_split_part2( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_prune, int *prune_rect_part_horz, + int *prune_rect_part_vert) { + if (is_intra_frame || !ext_part_controller->ready) { + return false; + } + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_SPLIT_PART2; + for (int i = 0; i < 9; ++i) { + features.after_part_split.f_prune_rect[i] = features_prune[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_rect_part_horz = decision.prune_rect_part[0]; + *prune_rect_part_vert = decision.prune_rect_part[1]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after rectangular partition. Specifically, these parameters: +// horza_partition_allowed +// horzb_partition_allowed +// verta_partition_allowed +// vertb_partition_allowed +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed) { + if (is_intra_frame || !ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_RECT; + for (int i = 0; i < 10; ++i) { + features.after_part_rect.f[i] = features_after_rect[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *horza_partition_allowed = decision.horza_partition_allowed; + *horzb_partition_allowed = decision.horzb_partition_allowed; + *verta_partition_allowed = decision.verta_partition_allowed; + *vertb_partition_allowed = decision.vertb_partition_allowed; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after AB partition. 
Specifically, these parameters: +// partition_vert4_allowed +// partition_horz4_allowed +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + + if (!frame_is_intra_only(cm) && ext_part_controller->ready) { + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_AB; + prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd, + rect_part_rd, split_rd, pb_source_variance, + mi_row, mi_col, &features); + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_horz4_allowed = decision.partition_horz4_allowed; + *partition_vert4_allowed = decision.partition_vert4_allowed; + + return true; + } + + return false; +} + #endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h index 0527a944cd..ed66a364d9 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h +++ b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h @@ -13,58 +13,10 @@ #define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ #include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encoder.h" -#define FEATURE_SIZE_SMS_SPLIT_FAST 6 -#define FEATURE_SIZE_SMS_SPLIT 17 -#define FEATURE_SIZE_SMS_PRUNE_PART 25 -#define FEATURE_SIZE_SMS_TERM_NONE 28 -#define FEATURE_SIZE_FP_SMS_TERM_NONE 20 -#define FEATURE_SIZE_MAX_MIN_PART_PRED 13 -#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 - -#define FEATURE_SMS_NONE_FLAG 1 -#define FEATURE_SMS_SPLIT_FLAG (1 << 1) -#define FEATURE_SMS_RECT_FLAG (1 << 2) - -#define FEATURE_SMS_PRUNE_PART_FLAG \ - (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG) -#define FEATURE_SMS_SPLIT_MODEL_FLAG \ - (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG) - -// Number of sub-partitions in rectangular partition types. -#define SUB_PARTITIONS_RECT 2 - -// Number of sub-partitions in split partition type. -#define SUB_PARTITIONS_SPLIT 4 - -// Number of sub-partitions in AB partition types. -#define SUB_PARTITIONS_AB 3 - -// Number of sub-partitions in 4-way partition types. -#define SUB_PARTITIONS_PART4 4 - -// 4part parition types. 
-enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES); - -// AB parition types. -enum { - HORZ_A = 0, - HORZ_B, - VERT_A, - VERT_B, - NUM_AB_PARTS -} UENUM1BYTE(AB_PART_TYPE); - -// Rectangular parition types. -enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE); - -// Structure to keep win flags for HORZ and VERT partition evaluations. -typedef struct { - int rect_part_win[NUM_RECT_PARTS]; -} RD_RECT_PART_WIN_INFO; - void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, int bsize, int label_idx, int *partition_none_allowed, @@ -129,16 +81,18 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, // no information about rectangular partitions. Preliminary experiments suggest // that we can get better performance by adding in q_index and rectangular // sse/var from SMS. We should retrain and tune this model later. -void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, - const MACROBLOCK *const x, BLOCK_SIZE bsize, - int64_t best_rd, int64_t none_rd, - int64_t *split_rd, int *const dst_prune_horz, +void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int64_t best_rd, + int64_t none_rd, int64_t *split_rd, + int *const dst_prune_horz, int *const dst_prune_vert); // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be // considered. 
void av1_ml_prune_ab_partition( - BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd, + AV1_COMP *const cpi, BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int part_ctx, int var_ctx, int64_t best_rd, int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed, int *const horzb_partition_allowed, int *const verta_partition_allowed, @@ -146,18 +100,19 @@ void av1_ml_prune_ab_partition( // Use a ML model to predict if horz4 and vert4 should be considered. void av1_ml_prune_4_partition( - const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, - int part_ctx, int64_t best_rd, - int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, int *const partition_vert4_allowed, unsigned int pb_source_variance, int mi_row, int mi_col); // ML-based partition search breakout after PARTITION_NONE. -int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - const MACROBLOCK *const x, - const RD_STATS *const rd_stats, - unsigned int pb_source_variance, int bit_depth); +void av1_ml_predict_breakout(AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + const PartitionBlkParams blk_params, + unsigned int pb_source_variance, int bit_depth, + int *do_square_split, int *do_rectangular_split); // The first round of partition pruning determined before any partition // has been tested. The decisions will be updated and passed back @@ -183,8 +138,9 @@ void av1_prune_partitions_by_max_min_bsize( // Prune out AB partitions based on rd decisions made from testing the // basic partitions. 
void av1_prune_ab_partitions( - const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, - BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost, + AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, + BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int pb_source_variance, int64_t best_rdcost, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed, @@ -261,22 +217,66 @@ static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params, (mi_col + sb_mi_wide) <= mi_params->mi_cols; } +#if !CONFIG_REALTIME_ONLY // Do not use this criteria for screen content videos. // Since screen content videos could often find good predictors and the largest // block size is likely to be used. static INLINE int use_auto_max_partition(const AV1_COMP *const cpi, BLOCK_SIZE sb_size, int mi_row, int mi_col) { - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); const AV1_COMMON *const cm = &cpi->common; return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools && cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE && sb_size == BLOCK_128X128 && is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) && - cpi->gf_group.update_type[cpi->gf_group.index] != OVERLAY_UPDATE && - cpi->gf_group.update_type[cpi->gf_group.index] != INTNL_OVERLAY_UPDATE; + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + OVERLAY_UPDATE && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + INTNL_OVERLAY_UPDATE; } +static BLOCK_SIZE dim_to_size(int dim) { + switch (dim) { + case 4: return BLOCK_4X4; + case 8: return BLOCK_8X8; + case 16: return BLOCK_16X16; + case 32: return BLOCK_32X32; + case 64: return BLOCK_64X64; + case 128: return BLOCK_128X128; + default: assert(0); return 0; + } +} + 
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc, + AV1_COMP *cpi, MACROBLOCK *x, + const SPEED_FEATURES *sf, + BLOCK_SIZE sb_size, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + + sb_enc->max_partition_size = + AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.part_cfg.max_partition_size)); + sb_enc->min_partition_size = + AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.part_cfg.min_partition_size)); + sb_enc->max_partition_size = + AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size); + sb_enc->min_partition_size = + AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size); + + if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { + float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; + + av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); + sb_enc->max_partition_size = + AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features), + sb_enc->max_partition_size), + sb_enc->min_partition_size); + } +} +#endif // !CONFIG_REALTIME_ONLY #endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c b/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c index 804fb3a510..e3639f7784 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c +++ b/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c @@ -43,6 +43,13 @@ #define DEFAULT_KF_BOOST 2300 #define DEFAULT_GF_BOOST 2000 #define GROUP_ADAPTIVE_MAXQ 1 + +static INLINE int is_fp_stats_to_predict_flat_gop_invalid( + const FIRSTPASS_STATS *fp_stats) { + return ((fp_stats->tr_coded_error < 0) || (fp_stats->pcnt_third_ref < 0) || + (fp_stats->frame_avg_wavelet_energy < 0)); +} + static void init_gf_stats(GF_GROUP_STATS *gf_stats); // Calculate an active area of the image that discounts formatting @@ -182,7 +189,7 @@ static double calc_correction_factor(double err_per_mb, int q) { // Based on history 
adjust expectations of bits per macroblock. static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { - TWO_PASS *twopass = &cpi->twopass; + TWO_PASS *twopass = &cpi->ppi->twopass; const RATE_CONTROL *const rc = &cpi->rc; int err_estimate = rc->rate_error_estimate; @@ -194,14 +201,14 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { const double max_fac = 1.0 + adj_limit; if (rc->vbr_bits_off_target && rc->total_actual_bits > 0) { - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { rate_err_factor = (double)twopass->rolling_arf_group_actual_bits / DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); } else { rate_err_factor = 1.0 - ((double)(rc->vbr_bits_off_target) / - AOMMAX(rc->total_actual_bits, cpi->twopass.bits_left)); + AOMMAX(rc->total_actual_bits, cpi->ppi->twopass.bits_left)); } rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor)); @@ -209,7 +216,7 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { // Adjustment is damped if this is 1 pass with look ahead processing // (as there are only ever a few frames of data) and for all but the first // GOP in normal two pass. - if ((twopass->bpm_factor != 1.0) || cpi->lap_enabled) { + if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) { rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac); } } @@ -302,9 +309,9 @@ static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err, // Try and pick a max Q that will be high enough to encode the // content at the given rate. int q = find_qindex_by_rate_with_correction( - target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, - av_err_per_mb, cpi->twopass.bpm_factor, rate_err_tol, rc->best_quality, - rc->worst_quality); + target_norm_bits_per_mb, cpi->common.seq_params->bit_depth, + av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol, + rc->best_quality, rc->worst_quality); // Restriction on active max q for constrained quality mode. 
if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level); @@ -312,57 +319,63 @@ static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err, } } -#define SR_DIFF_PART 0.0015 -#define MOTION_AMP_PART 0.003 #define INTRA_PART 0.005 #define DEFAULT_DECAY_LIMIT 0.75 #define LOW_SR_DIFF_TRHESH 0.1 -#define SR_DIFF_MAX 128.0 #define NCOUNT_FRAME_II_THRESH 5.0 +#define LOW_CODED_ERR_PER_MB 10.0 -static double get_sr_decay_rate(const FRAME_INFO *frame_info, - const FIRSTPASS_STATS *frame) { - const int num_mbs = frame_info->num_mbs; - double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; +/* This function considers how the quality of prediction may be deteriorating + * with distance. It comapres the coded error for the last frame and the + * second reference frame (usually two frames old) and also applies a factor + * based on the extent of INTRA coding. + * + * The decay factor is then used to reduce the contribution of frames further + * from the alt-ref or golden frame, to the bitframe boost calculation for that + * alt-ref or golden frame. 
+ */ +static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) { + double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; double modified_pct_inter; double modified_pcnt_intra; - const double motion_amplitude_factor = - frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); modified_pct_inter = frame->pcnt_inter; - if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < - (double)NCOUNT_FRAME_II_THRESH) { + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && + ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH)) { modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; } modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); if ((sr_diff > LOW_SR_DIFF_TRHESH)) { - sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX); - sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - - (MOTION_AMP_PART * motion_amplitude_factor) - - (INTRA_PART * modified_pcnt_intra); + double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error); + sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); } - return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); + return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. 
-static double get_zero_motion_factor(const FRAME_INFO *frame_info, - const FIRSTPASS_STATS *frame) { +static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) { const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; - double sr_decay = get_sr_decay_rate(frame_info, frame); + double sr_decay = get_sr_decay_rate(frame); return AOMMIN(sr_decay, zero_motion_pct); } -#define ZM_POWER_FACTOR 0.75 +#define DEFAULT_ZM_FACTOR 0.5 +static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) { + const double sr_decay_rate = get_sr_decay_rate(frame_stats); + double zero_motion_factor = + DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); -static double get_prediction_decay_rate(const FRAME_INFO *frame_info, - const FIRSTPASS_STATS *next_frame) { - const double sr_decay_rate = get_sr_decay_rate(frame_info, next_frame); - const double zero_motion_factor = - (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), - ZM_POWER_FACTOR)); + // Clamp value to range 0.0 to 1.0 + // This should happen anyway if input values are sensibly clamped but checked + // here just in case. 
+ if (zero_motion_factor > 1.0) + zero_motion_factor = 1.0; + else if (zero_motion_factor < 0.0) + zero_motion_factor = 0.0; return AOMMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); @@ -449,7 +462,6 @@ static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats, } static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats, - const FRAME_INFO *frame_info, const int flash_detected, const int frames_since_key, const int cur_idx, @@ -470,16 +482,15 @@ static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats, // Accumulate the effect of prediction quality decay if (!flash_detected) { gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate; - gf_stats->loop_decay_rate = get_prediction_decay_rate(frame_info, stats); + gf_stats->loop_decay_rate = get_prediction_decay_rate(stats); gf_stats->decay_accumulator = gf_stats->decay_accumulator * gf_stats->loop_decay_rate; // Monitor for static sections. if ((frames_since_key + cur_idx - 1) > 1) { - gf_stats->zero_motion_accumulator = - AOMMIN(gf_stats->zero_motion_accumulator, - get_zero_motion_factor(frame_info, stats)); + gf_stats->zero_motion_accumulator = AOMMIN( + gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats)); } } } @@ -618,8 +629,8 @@ static double calc_kf_frame_boost(const RATE_CONTROL *rc, return AOMMIN(frame_boost, max_boost * boost_q_correction); } -static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost, - int frames_to_project, +static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc, + int gfu_boost, int frames_to_project, int num_stats_used_for_gfu_boost) { /* * If frames_to_project is equal to num_stats_used_for_gfu_boost, @@ -629,7 +640,7 @@ static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost, */ if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost; - double min_boost_factor = sqrt(rc->baseline_gf_interval); + double min_boost_factor = 
sqrt(p_rc->baseline_gf_interval); // Get the current tpl factor (number of frames = frames_to_project). double tpl_factor = av1_get_gfu_boost_projection_factor( min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project); @@ -642,11 +653,13 @@ static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost, } #define GF_MAX_BOOST 90.0 +#define GF_MIN_BOOST 50 #define MIN_DECAY_FACTOR 0.01 -int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, +int av1_calc_arf_boost(const TWO_PASS *twopass, + const PRIMARY_RATE_CONTROL *p_rc, const RATE_CONTROL *rc, FRAME_INFO *frame_info, int offset, int f_frames, int b_frames, int *num_fpstats_used, - int *num_fpstats_required) { + int *num_fpstats_required, int project_gfu_boost) { int i; GF_GROUP_STATS gf_stats; init_gf_stats(&gf_stats); @@ -670,8 +683,7 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - gf_stats.decay_accumulator *= - get_prediction_decay_rate(frame_info, this_frame); + gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : gf_stats.decay_accumulator; @@ -704,8 +716,7 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, // Cumulative effect of prediction quality decay. if (!flash_detected) { - gf_stats.decay_accumulator *= - get_prediction_decay_rate(frame_info, this_frame); + gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR ? 
MIN_DECAY_FACTOR : gf_stats.decay_accumulator; @@ -719,16 +730,16 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, } arf_boost += (int)boost_score; - if (num_fpstats_required) { + if (project_gfu_boost) { + assert(num_fpstats_required != NULL); + assert(num_fpstats_used != NULL); *num_fpstats_required = f_frames + b_frames; - if (num_fpstats_used) { - arf_boost = get_projected_gfu_boost(rc, arf_boost, *num_fpstats_required, - *num_fpstats_used); - } + arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required, + *num_fpstats_used); } - if (arf_boost < ((b_frames + f_frames) * 50)) - arf_boost = ((b_frames + f_frames) * 50); + if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST)) + arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST); return arf_boost; } @@ -767,7 +778,8 @@ static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, double gf_group_err) { const RATE_CONTROL *const rc = &cpi->rc; - const TWO_PASS *const twopass = &cpi->twopass; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const TWO_PASS *const twopass = &cpi->ppi->twopass; const int max_bits = frame_max_bits(rc, &cpi->oxcf); int64_t total_group_bits; @@ -787,8 +799,8 @@ static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, : total_group_bits; // Clip based on user supplied data rate variability limit. 
- if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval; return total_group_bits; } @@ -834,7 +846,8 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, int64_t group_bits, int frame_type) { const AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const int temporal_layer_id = cm->temporal_layer_id; const int spatial_layer_id = cm->spatial_layer_id; for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1; @@ -845,7 +858,7 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, } const AV1_LEVEL target_level = - cpi->level_params.target_seq_level_idx[index]; + cpi->ppi->level_params.target_seq_level_idx[index]; if (target_level >= SEQ_LEVELS) continue; assert(is_valid_seq_level_idx(target_level)); @@ -859,18 +872,20 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, const int level_enforced_max_kf_bits = target_bits_per_frame * 8; if (bits_assigned > level_enforced_max_kf_bits) { const int frames = rc->frames_to_key - 1; - rc->kf_boost = calculate_boost_factor( + p_rc->kf_boost = calculate_boost_factor( frames, level_enforced_max_kf_bits, group_bits); - bits_assigned = calculate_boost_bits(frames, rc->kf_boost, group_bits); + bits_assigned = + calculate_boost_bits(frames, p_rc->kf_boost, group_bits); } } else if (frame_type == 1) { // Maximum bits for arf is 4 times the target_bits_per_frame. 
const int level_enforced_max_arf_bits = target_bits_per_frame * 4; if (bits_assigned > level_enforced_max_arf_bits) { - rc->gfu_boost = calculate_boost_factor( - rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits); - bits_assigned = calculate_boost_bits(rc->baseline_gf_interval, - rc->gfu_boost, group_bits); + p_rc->gfu_boost = + calculate_boost_factor(p_rc->baseline_gf_interval, + level_enforced_max_arf_bits, group_bits); + bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval, + p_rc->gfu_boost, group_bits); } } else { assert(0); @@ -883,7 +898,9 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, // Allocate bits to each frame in a GF / ARF group double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60, 0.60, 1.0, 1.0 }; -static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, +static void allocate_gf_group_bits(GF_GROUP *gf_group, + PRIMARY_RATE_CONTROL *const p_rc, + RATE_CONTROL *const rc, int64_t gf_group_bits, int gf_arf_bits, int key_frame, int use_arf) { int64_t total_group_bits = gf_group_bits; @@ -900,7 +917,7 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, if (use_arf) total_group_bits -= gf_arf_bits; int num_frames = - AOMMAX(1, rc->baseline_gf_interval - (rc->frames_since_key == 0)); + AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0)); base_frame_bits = (int)(total_group_bits / num_frames); // Check the number of frames in each layer in case we have a @@ -943,7 +960,8 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, // in the next GOP. For GF group, next GOP will overwrite the rate allocation. // Setting this frame to use 0 bit (of out the current GOP budget) will // simplify logics in reference frame management. 
- gf_group->bit_allocation[gf_group_size] = 0; + if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH) + gf_group->bit_allocation[gf_group_size] = 0; } // Returns true if KF group and GF group both are almost completely static. @@ -967,7 +985,7 @@ static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, int active_min_gf_interval, GF_GROUP_STATS *gf_stats) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; // Motion breakout threshold for loop below depends on image size. const double mv_ratio_accumulator_thresh = @@ -997,12 +1015,71 @@ static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, // so we can continue for more frames. if (((frame_index - cur_start) >= active_max_gf_interval + 1) && !is_almost_static(gf_stats->zero_motion_accumulator, - twopass->kf_zeromotion_pct, cpi->lap_enabled)) { + twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) { return 1; } return 0; } +static int is_shorter_gf_interval_better(AV1_COMP *cpi, + EncodeFrameParams *frame_params, + const EncodeFrameInput *frame_input) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method; + int shorten_gf_interval; + + av1_tpl_preload_rc_estimate(cpi, frame_params); + + if (gop_length_decision_method == 2) { + // GF group length is decided based on GF boost and tpl stats of ARFs from + // base layer, (base+1) layer. 
+ shorten_gf_interval = + (p_rc->gfu_boost < + p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) && + !av1_tpl_setup_stats(cpi, 3, frame_params, frame_input); + } else { + int do_complete_tpl = 1; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + int is_temporal_filter_enabled = + (rc->frames_since_key > 0 && gf_group->arf_index > -1); + + if (is_temporal_filter_enabled) { + int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index]; + FRAME_UPDATE_TYPE arf_update_type = + gf_group->update_type[gf_group->arf_index]; + int is_forward_keyframe = 0; + av1_temporal_filter(cpi, arf_src_index, arf_update_type, + is_forward_keyframe, NULL); + aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer, + av1_num_planes(&cpi->common)); + } + + if (gop_length_decision_method == 1) { + // Check if tpl stats of ARFs from base layer, (base+1) layer, + // (base+2) layer can decide the GF group length. + int gop_length_eval = + av1_tpl_setup_stats(cpi, 2, frame_params, frame_input); + + if (gop_length_eval != 2) { + do_complete_tpl = 0; + shorten_gf_interval = !gop_length_eval; + } + } + + if (do_complete_tpl) { + // Decide GF group length based on complete tpl stats. + shorten_gf_interval = + !av1_tpl_setup_stats(cpi, 1, frame_params, frame_input); + // Tpl stats is reused when the ARF is temporally filtered and GF + // interval is not shortened. + if (is_temporal_filter_enabled && !shorten_gf_interval) + cpi->skip_tpl_setup_stats = 1; + } + } + return shorten_gf_interval; +} + #define MIN_FWD_KF_INTERVAL 8 #define MIN_SHRINK_LEN 6 // the minimum length of gf if we are shrinking #define SMOOTH_FILT_LEN 7 @@ -1014,17 +1091,16 @@ const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383, 0.242, 0.061, 0.006 }; // Smooth filter intra_error and coded_error in firstpass stats. -// If ignore[i]==1, the ith element should not be used in the filtering. 
-static void smooth_filter_stats(const FIRSTPASS_STATS *stats, const int *ignore, - int start_idx, int last_idx, - double *filt_intra_err, +// If stats[i].is_flash==1, the ith element should not be used in the filtering. +static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx, + int last_idx, double *filt_intra_err, double *filt_coded_err) { int i, j; for (i = start_idx; i <= last_idx; i++) { double total_wt = 0; for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); - if (ignore[idx]) continue; + if (stats[idx].is_flash) continue; filt_intra_err[i] += smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error; @@ -1041,7 +1117,7 @@ static void smooth_filter_stats(const FIRSTPASS_STATS *stats, const int *ignore, for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); // Coded error involves idx and idx - 1. - if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue; + if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; filt_coded_err[i] += smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error; @@ -1070,7 +1146,7 @@ static void get_gradient(const double *values, int start, int last, } static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, - int first, int last, int *ignore) { + int first, int last) { // Identify unstable areas caused by scenecuts. // Find the max and 2nd max coded error, and the average of the rest frames. 
// If there is only one frame that yields a huge coded error, it is likely a @@ -1081,14 +1157,16 @@ static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, if (last - first == 0) return -1; for (int i = first; i <= last; i++) { - if (ignore[i] || (i > 0 && ignore[i - 1])) continue; + if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash)) + continue; double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01); this_ratio = stats_start[i].coded_error / temp_intra; // find the avg ratio in the preceding neighborhood max_prev_ratio = 0; max_prev_coded = 0; for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) { - if (ignore[j] || (j > 0 && ignore[j - 1])) continue; + if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash)) + continue; temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); double temp_ratio = stats_start[j].coded_error / temp_intra; if (temp_ratio > max_prev_ratio) { @@ -1102,7 +1180,8 @@ static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, max_next_ratio = 0; max_next_coded = 0; for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) { - if (ignore[j] || (j > 0 && ignore[j - 1])) continue; + if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash)) + continue; temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); double temp_ratio = stats_start[j].coded_error / temp_intra; if (temp_ratio > max_next_ratio) { @@ -1135,19 +1214,6 @@ static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, return -1; } -static void mark_flashes(const FIRSTPASS_STATS *stats, int start_idx, - int last_idx, int *is_flash) { - int i; - for (i = start_idx; i < last_idx; i++) { - if (stats[i + 1].pcnt_second_ref > stats[i + 1].pcnt_inter && - stats[i + 1].pcnt_second_ref >= 0.5) { - // this is a new flash frame - is_flash[i] = 1; - continue; - } - } -} - // Remove the region with index next_region. 
// parameter merge: 0: merge with previous; 1: merge with next; 2: // merge with both, take type from previous if possible @@ -1220,46 +1286,10 @@ static void insert_region(int start, int last, REGION_TYPES type, *cur_region_idx = k; } -// Estimate the noise variance of each frame from the first pass stats -static void estimate_region_noise(const FIRSTPASS_STATS *stats, - const int *is_flash, REGIONS *region) { - double C1, C2, C3, noise; - int count = 0; - region->avg_noise_var = -1; - for (int i = region->start + 2; i <= region->last; i++) { - if (is_flash[i] || is_flash[i - 1] || is_flash[i - 2]) continue; - - C1 = stats[i - 1].intra_error * - (stats[i].intra_error - stats[i].coded_error); - C2 = stats[i - 2].intra_error * - (stats[i - 1].intra_error - stats[i - 1].coded_error); - C3 = stats[i - 2].intra_error * - (stats[i].intra_error - stats[i].sr_coded_error); - if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue; - C1 = sqrt(C1); - C2 = sqrt(C2); - C3 = sqrt(C3); - - noise = stats[i - 1].intra_error - C1 * C2 / C3; - noise = AOMMAX(noise, 0.01); - region->avg_noise_var = (region->avg_noise_var == -1) - ? noise - : AOMMIN(noise, region->avg_noise_var); - count++; - } - if (count == 0) { - region->avg_noise_var = 0; - } -} - -// Analyze the corrrelation coefficient of each frame with its previous frame in -// a region. Also get the average of stats inside a region. -// Before calling this function, the region's noise variance is needed. -static void analyze_region(const FIRSTPASS_STATS *stats, int region_idx, - REGIONS *regions, double *coeff) { - double cor_coeff; - - int i, k = region_idx; +// Get the average of stats inside a region. 
+static void analyze_region(const FIRSTPASS_STATS *stats, int k, + REGIONS *regions) { + int i; regions[k].avg_cor_coeff = 0; regions[k].avg_sr_fr_ratio = 0; regions[k].avg_intra_err = 0; @@ -1268,12 +1298,6 @@ static void analyze_region(const FIRSTPASS_STATS *stats, int region_idx, int check_first_sr = (k != 0); for (i = regions[k].start; i <= regions[k].last; i++) { - double C = sqrt(AOMMAX(stats[i - 1].intra_error * - (stats[i].intra_error - stats[i].coded_error), - 0.001)); - cor_coeff = - C / AOMMAX(stats[i - 1].intra_error - regions[k].avg_noise_var, 0.001); - if (i > regions[k].start || check_first_sr) { double num_frames = (double)(regions[k].last - regions[k].start + check_first_sr); @@ -1289,85 +1313,27 @@ static void analyze_region(const FIRSTPASS_STATS *stats, int region_idx, regions[k].avg_coded_err += stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1); - coeff[i] = - cor_coeff * - sqrt( - AOMMAX(stats[i - 1].intra_error - regions[k].avg_noise_var, 0.001) / - AOMMAX(stats[i].intra_error - regions[k].avg_noise_var, 0.001)); - // clip correlation coefficient. - coeff[i] = AOMMIN(AOMMAX(coeff[i], 0), 1); - regions[k].avg_cor_coeff += - coeff[i] / (double)(regions[k].last - regions[k].start + 1); + AOMMAX(stats[i].cor_coeff, 0.001) / + (double)(regions[k].last - regions[k].start + 1); + regions[k].avg_noise_var += + AOMMAX(stats[i].noise_var, 0.001) / + (double)(regions[k].last - regions[k].start + 1); } } -// Calculate the regions stats of every region. Uses the stable regions to -// estimate noise variance of other regions. Then call analyze_region for each. -static void get_region_stats(const FIRSTPASS_STATS *stats, const int *is_flash, - REGIONS *regions, double *coeff, int num_regions) { - int k, count_stable = 0; - // Analyze stable regions. 
- for (k = 0; k < num_regions; k++) { - if (regions[k].type == STABLE_REGION) { - estimate_region_noise(stats, is_flash, regions + k); - analyze_region(stats, k, regions, coeff); - count_stable++; - } - } - - if (count_stable == 0) { - // no stable region, just use the lowest noise variance estimated. - double lowest_noise = -1; - for (k = 0; k < num_regions; k++) { - if (regions[k].type == SCENECUT_REGION) continue; - estimate_region_noise(stats, is_flash, regions + k); - if (regions[k].avg_noise_var < 0.01) continue; - if (lowest_noise < 0 || lowest_noise > regions[k].avg_noise_var) { - lowest_noise = regions[k].avg_noise_var; - } - } - lowest_noise = AOMMAX(lowest_noise, 0); - for (k = 0; k < num_regions; k++) { - regions[k].avg_noise_var = lowest_noise; - analyze_region(stats, k, regions, coeff); - } - return; - } - - // Analyze other regions - for (k = 0; k < num_regions; k++) { - if (regions[k].type != STABLE_REGION) { - // use the average of the nearest previous and next stable regions - int count = 0; - regions[k].avg_noise_var = 0; - for (int r = k - 1; r >= 0; r--) { - if (regions[r].type == STABLE_REGION) { - count++; - regions[k].avg_noise_var += regions[r].avg_noise_var; - break; - } - } - for (int r = k + 1; r < num_regions; r++) { - if (regions[r].type == STABLE_REGION) { - count++; - regions[k].avg_noise_var += regions[r].avg_noise_var; - break; - } - } - if (count) { - regions[k].avg_noise_var /= (double)count; - } - analyze_region(stats, k, regions, coeff); - } +// Calculate the regions stats of every region. 
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions, + int num_regions) { + for (int k = 0; k < num_regions; k++) { + analyze_region(stats, k, regions); } } // Find tentative stable regions static int find_stable_regions(const FIRSTPASS_STATS *stats, - const double *grad_coded, const int *ignore, - int this_start, int this_last, - REGIONS *regions) { + const double *grad_coded, int this_start, + int this_last, REGIONS *regions) { int i, j, k = 0; regions[k].start = this_start; for (i = this_start; i <= this_last; i++) { @@ -1377,7 +1343,7 @@ static int find_stable_regions(const FIRSTPASS_STATS *stats, int count = 0; for (j = -HALF_WIN; j <= HALF_WIN; j++) { int idx = AOMMIN(AOMMAX(i + j, this_start), this_last); - if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue; + if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; mean_intra += stats[idx].intra_error; var_intra += stats[idx].intra_error * stats[idx].intra_error; mean_coded += stats[idx].coded_error; @@ -1451,15 +1417,13 @@ static void remove_short_regions(REGIONS *regions, int *num_regions, } static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, - const int *is_flash, - const double *grad, REGIONS *regions, - double *coeff, int *num_regions) { + REGIONS *regions, int *num_regions) { int i, j, k; // Remove regions that are too short. Likely noise. remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN); remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); - get_region_stats(stats, is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); // Adjust region boundaries. The thresholds are empirically obtained, but // overall the performance is not very sensitive to small changes to them. @@ -1469,34 +1433,24 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, // Adjust previous boundary. 
// First find the average intra/coded error in the previous // neighborhood. - double avg_intra_err = 0, avg_coded_err = 0, avg_coeff = 0; - int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1, - regions[k - 1].start + 1); - int lasti = regions[k - 1].last; + double avg_intra_err = 0; + const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1, + regions[k - 1].start + 1); + const int lasti = regions[k - 1].last; int counti = 0; for (i = starti; i <= lasti; i++) { avg_intra_err += stats[i].intra_error; - avg_coded_err += stats[i].coded_error; - avg_coeff += coeff[i]; counti++; } if (counti > 0) { avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); - avg_coded_err /= AOMMAX(avg_coded_err / (double)counti, 0.001); - avg_coeff /= AOMMIN(avg_intra_err / (double)counti, 0.99999); int count_coded = 0, count_grad = 0; for (j = lasti + 1; j <= regions[k].last; j++) { - int intra_close = + const int intra_close = fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; - int coded_close = - fabs(stats[j].coded_error - avg_coded_err) / avg_coded_err < 0.15; - int grad_small = fabs(grad[j]) / avg_coded_err < 0.05; - int coded_small = stats[j].coded_error / avg_intra_err < 0.03; - int coeff_close = - (1 - coeff[j]) / (1 - avg_coeff) < 1.5 || coeff[j] > 0.995; - if (!coeff_close || (!coded_close && !coded_small)) count_coded--; - if (!grad_small && !coded_small) count_grad--; - + const int coded_small = stats[j].coded_error / avg_intra_err < 0.1; + const int coeff_close = stats[j].cor_coeff > 0.995; + if (!coeff_close || !coded_small) count_coded--; if (intra_close && count_coded >= 0 && count_grad >= 0) { // this frame probably belongs to the previous stable region regions[k - 1].last = j; @@ -1510,35 +1464,26 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, if (k < *num_regions - 1) { // Adjust next boundary. // First find the average intra/coded error in the next neighborhood. 
- double avg_intra_err = 0, avg_coded_err = 0, avg_coeff = 0; - int starti = regions[k + 1].start; - int lasti = AOMMIN(regions[k + 1].last - 1, - regions[k + 1].start + WINDOW_SIZE - 1); + double avg_intra_err = 0; + const int starti = regions[k + 1].start; + const int lasti = AOMMIN(regions[k + 1].last - 1, + regions[k + 1].start + WINDOW_SIZE - 1); int counti = 0; for (i = starti; i <= lasti; i++) { avg_intra_err += stats[i].intra_error; - avg_coded_err += stats[i + 1].coded_error; - avg_coeff += coeff[i]; counti++; } if (counti > 0) { avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); - avg_coded_err /= AOMMAX(avg_coded_err / (double)counti, 0.001); - avg_coeff /= AOMMIN(avg_intra_err / (double)counti, 0.99999); // At the boundary, coded error is large, but still the frame is stable int count_coded = 1, count_grad = 1; for (j = starti - 1; j >= regions[k].start; j--) { - int intra_close = + const int intra_close = fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; - int coded_close = - fabs(stats[j + 1].coded_error - avg_coded_err) / avg_coded_err < - 0.15; - int grad_small = fabs(grad[j + 1]) / avg_coded_err < 0.05; - int coded_small = stats[j + 1].coded_error / avg_intra_err < 0.03; - int coeff_close = - (1 - coeff[j + 1]) / (1 - avg_coeff) < 1.5 || coeff[j] > 0.995; - if (!coeff_close || (!coded_close && !coded_small)) count_coded--; - if (!grad_small && !coded_small) count_grad--; + const int coded_small = + stats[j + 1].coded_error / avg_intra_err < 0.1; + const int coeff_close = stats[j].cor_coeff > 0.995; + if (!coeff_close || !coded_small) count_coded--; if (intra_close && count_coded >= 0 && count_grad >= 0) { // this frame probably belongs to the next stable region regions[k + 1].start = j; @@ -1553,7 +1498,7 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, cleanup_regions(regions, num_regions); remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); - get_region_stats(stats, 
is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); // If a stable regions has higher error than neighboring high var regions, // or if the stable region has a lower average correlation, @@ -1561,25 +1506,31 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, k = 0; while (k < *num_regions && (*num_regions) > 1) { if (regions[k].type == STABLE_REGION && + (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && ((k > 0 && // previous regions - (regions[k].avg_coded_err > regions[k - 1].avg_coded_err || - regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff)) && + (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 || + regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) && (k < *num_regions - 1 && // next region - (regions[k].avg_coded_err > regions[k + 1].avg_coded_err || - regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff)))) { + (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 || + regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) { // merge current region with the previous and next regions remove_region(2, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); } else if (regions[k].type == HIGH_VAR_REGION && + (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && ((k > 0 && // previous regions - (regions[k].avg_coded_err < regions[k - 1].avg_coded_err || - regions[k].avg_cor_coeff > regions[k - 1].avg_cor_coeff)) && + (regions[k].avg_coded_err < + regions[k - 1].avg_coded_err * 0.99 || + regions[k].avg_cor_coeff > + regions[k - 1].avg_cor_coeff * 1.001)) && (k < *num_regions - 1 && // next region - (regions[k].avg_coded_err < regions[k + 1].avg_coded_err || - regions[k].avg_cor_coeff > regions[k + 1].avg_cor_coeff)))) { + (regions[k].avg_coded_err < + regions[k + 1].avg_coded_err * 0.99 || + regions[k].avg_cor_coeff > + regions[k + 1].avg_cor_coeff * 1.001)))) 
{ // merge current region with the previous and next regions remove_region(2, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); } else { k++; } @@ -1591,8 +1542,7 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, // Identify blending regions. static void find_blending_regions(const FIRSTPASS_STATS *stats, - const int *is_flash, REGIONS *regions, - int *num_regions, double *coeff) { + REGIONS *regions, int *num_regions) { int i, k = 0; // Blending regions will have large content change, therefore will have a // large consistent change in intra error. @@ -1607,7 +1557,8 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, int start = 0, last; for (i = regions[k].start; i <= regions[k].last; i++) { // First mark the regions that has consistent large change of intra error. - if (is_flash[i] || (i > 0 && is_flash[i - 1])) continue; + if (k == 0 && i == regions[k].start) continue; + if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue; double grad = stats[i].intra_error - stats[i - 1].intra_error; int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05; int this_dir = 0; @@ -1622,7 +1573,11 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, insert_region(start, last, BLENDING_REGION, regions, num_regions, &k); } dir = this_dir; - start = i; + if (k == 0 && i == regions[k].start + 1) { + start = i - 1; + } else { + start = i; + } } if (dir != 0) { last = regions[k].last; @@ -1633,14 +1588,14 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, // If the blending region has very low correlation, mark it as high variance // since we probably cannot benefit from it anyways. 
- get_region_stats(stats, is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); for (k = 0; k < *num_regions; k++) { if (regions[k].type != BLENDING_REGION) continue; if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 || count_stable == 0) regions[k].type = HIGH_VAR_REGION; } - get_region_stats(stats, is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); // It is possible for blending to result in a "dip" in intra error (first // decrease then increase). Therefore we need to find the dip and combine the @@ -1669,7 +1624,7 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, if (regions[k].avg_sr_fr_ratio > ratio_thres) { regions[k].type = BLENDING_REGION; remove_region(2, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); continue; } } @@ -1727,7 +1682,7 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, if (to_merge) { remove_region(0, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); continue; } else { // These are possibly two separate blending regions. Mark the boundary @@ -1735,9 +1690,9 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, int prev_k = k - 1; insert_region(regions[prev_k].last, regions[prev_k].last, HIGH_VAR_REGION, regions, num_regions, &prev_k); - analyze_region(stats, prev_k, regions, coeff); + analyze_region(stats, prev_k, regions); k = prev_k + 1; - analyze_region(stats, k, regions, coeff); + analyze_region(stats, k, regions); } } k++; @@ -1793,16 +1748,13 @@ static void cleanup_blendings(REGIONS *regions, int *num_regions) { // pointing to. 
static void identify_regions(const FIRSTPASS_STATS *const stats_start, int total_frames, int offset, REGIONS *regions, - int *total_regions, double *cor_coeff) { + int *total_regions) { int k; if (total_frames <= 1) return; - double *coeff = cor_coeff + offset; - // store the initial decisions REGIONS temp_regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; av1_zero_array(temp_regions, MAX_FIRSTPASS_ANALYSIS_FRAMES); - int is_flash[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 }; // buffers for filtered stats double filt_intra_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 }; double filt_coded_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 }; @@ -1810,32 +1762,28 @@ static void identify_regions(const FIRSTPASS_STATS *const stats_start, int cur_region = 0, this_start = 0, this_last; - // find possible flash frames - mark_flashes(stats_start, 0, total_frames - 1, is_flash); - - // first get the obvious scenecuts int next_scenecut = -1; - do { + // first get the obvious scenecuts next_scenecut = - find_next_scenecut(stats_start, this_start, total_frames - 1, is_flash); + find_next_scenecut(stats_start, this_start, total_frames - 1); this_last = (next_scenecut >= 0) ? 
(next_scenecut - 1) : total_frames - 1; + // low-pass filter the needed stats - smooth_filter_stats(stats_start, is_flash, this_start, this_last, - filt_intra_err, filt_coded_err); + smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err, + filt_coded_err); get_gradient(filt_coded_err, this_start, this_last, grad_coded); // find tentative stable regions and unstable regions - int num_regions = find_stable_regions(stats_start, grad_coded, is_flash, - this_start, this_last, temp_regions); - adjust_unstable_region_bounds(stats_start, is_flash, grad_coded, - temp_regions, coeff, &num_regions); + int num_regions = find_stable_regions(stats_start, grad_coded, this_start, + this_last, temp_regions); - get_region_stats(stats_start, is_flash, temp_regions, coeff, num_regions); + adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions); + + get_region_stats(stats_start, temp_regions, num_regions); // Try to identify blending regions in the unstable regions - find_blending_regions(stats_start, is_flash, temp_regions, &num_regions, - coeff); + find_blending_regions(stats_start, temp_regions, &num_regions); cleanup_blendings(temp_regions, &num_regions); // The flash points should all be considered high variance points @@ -1848,7 +1796,7 @@ static void identify_regions(const FIRSTPASS_STATS *const stats_start, int start = temp_regions[k].start; int last = temp_regions[k].last; for (int i = start; i <= last; i++) { - if (is_flash[i]) { + if (stats_start[i].is_flash) { insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k); } } @@ -1858,6 +1806,11 @@ static void identify_regions(const FIRSTPASS_STATS *const stats_start, // copy the regions in the scenecut group for (k = 0; k < num_regions; k++) { + if (temp_regions[k].last < temp_regions[k].start && + k == num_regions - 1) { + num_regions--; + break; + } regions[k + cur_region] = temp_regions[k]; } cur_region += num_regions; @@ -1874,17 +1827,21 @@ static void identify_regions(const 
FIRSTPASS_STATS *const stats_start, } while (next_scenecut >= 0); *total_regions = cur_region; - get_region_stats(stats_start, is_flash, regions, coeff, *total_regions); + get_region_stats(stats_start, regions, *total_regions); for (k = 0; k < *total_regions; k++) { // If scenecuts are very minor, mark them as high variance. - if (regions[k].type != SCENECUT_REGION || regions[k].avg_cor_coeff < 0.8) { + if (regions[k].type != SCENECUT_REGION || + regions[k].avg_cor_coeff * + (1 - stats_start[regions[k].start].noise_var / + regions[k].avg_intra_err) < + 0.8) { continue; } regions[k].type = HIGH_VAR_REGION; } cleanup_regions(regions, total_regions); - get_region_stats(stats_start, is_flash, regions, coeff, *total_regions); + get_region_stats(stats_start, regions, *total_regions); for (k = 0; k < *total_regions; k++) { regions[k].start += offset; @@ -1911,16 +1868,17 @@ static int find_regions_index(const REGIONS *regions, int num_regions, * \param[in] max_gop_length Maximum length of the GF group * \param[in] max_intervals Maximum number of intervals to decide * - * \return Nothing is returned. Instead, cpi->rc.gf_intervals is + * \return Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is * changed to store the decided GF group lengths. 
*/ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, int max_intervals) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; - FRAME_INFO *frame_info = &cpi->frame_info; + const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0); int i; int flash_detected; @@ -1930,9 +1888,9 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, if (has_no_stats_stage(cpi)) { for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) { - rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); + p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); } - rc->cur_gf_index = 0; + p_rc->cur_gf_index = 0; rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS; return; } @@ -1944,17 +1902,17 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval); i = (rc->frames_since_key == 0); - max_intervals = cpi->lap_enabled ? 1 : max_intervals; + max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals; int count_cuts = 1; // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF. 
- int cur_start = -1 + !cpi->gf_state.arf_gf_boost_lst, cur_last; + int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last; int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 }; int cut_here; GF_GROUP_STATS gf_stats; init_gf_stats(&gf_stats); while (count_cuts < max_intervals + 1) { // reaches next key frame, break here - if (i >= rc->frames_to_key + rc->next_is_fwd_key) { + if (i >= rc->frames_to_key + p_rc->next_is_fwd_key) { cut_here = 2; } else if (i - cur_start >= rc->static_scene_max_gf_interval) { // reached maximum len, but nothing special yet (almost static) @@ -1969,7 +1927,7 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, flash_detected = detect_flash(twopass, 0); // TODO(bohanli): remove redundant accumulations here, or unify // this and the ones in define_gf_group - accumulate_next_frame_stats(&next_frame, frame_info, flash_detected, + accumulate_next_frame_stats(&next_frame, flash_detected, rc->frames_since_key, i, &gf_stats); cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected, @@ -1981,10 +1939,10 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, int ori_last = cur_last; // The region frame idx does not start from the same frame as cur_start // and cur_last. Need to offset them. - int offset = rc->frames_since_key - rc->regions_offset; - REGIONS *regions = rc->regions; - int num_regions = rc->num_regions; - if (cpi->oxcf.kf_cfg.fwd_kf_enabled && rc->next_is_fwd_key) { + int offset = rc->frames_since_key - p_rc->regions_offset; + REGIONS *regions = p_rc->regions; + int num_regions = p_rc->num_regions; + if (cpi->oxcf.kf_cfg.fwd_kf_enabled && p_rc->next_is_fwd_key) { const int frames_left = rc->frames_to_key - i; const int min_int = AOMMIN(MIN_FWD_KF_INTERVAL, active_min_gf_interval); if (frames_left < min_int && frames_left > 0) { @@ -2021,7 +1979,11 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // If we have a scenecut, then stop at it. 
// TODO(bohanli): add logic here to stop before the scenecut and for // the next gop start from the scenecut with GF - int is_minor_sc = (regions[scenecut_idx].avg_cor_coeff > 0.6); + int is_minor_sc = + (regions[scenecut_idx].avg_cor_coeff * + (1 - stats[regions[scenecut_idx].start - offset].noise_var / + regions[scenecut_idx].avg_intra_err) > + 0.6); cur_last = regions[scenecut_idx].last - offset - !is_minor_sc; } else { int is_last_analysed = (k_last == num_regions - 1) && @@ -2032,45 +1994,91 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // if we are very close to the end, then do not shrink since it may // introduce intervals that are too short if (!(is_last_analysed && not_enough_regions)) { - int found = 0; - // first try to end at a stable area - for (int j = cur_last; j >= cur_start + min_shrink_int; j--) { - if (regions[find_regions_index(regions, num_regions, j + offset)] - .type == STABLE_REGION) { - cur_last = j; - found = 1; - break; - } + const double arf_length_factor = 0.1; + double best_score = 0; + int best_j = -1; + const int first_frame = regions[0].start - offset; + const int last_frame = regions[num_regions - 1].last - offset; + // score of how much the arf helps the whole GOP + double base_score = 0.0; + // Accumulate base_score in + for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) { + if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; + base_score = (base_score + 1.0) * stats[j].cor_coeff; } - if (!found) { - // Could not find stable point, - // try to find an OK point (high correlation, not blending) - for (int j = cur_last; j >= cur_start + min_shrink_int; j--) { - REGIONS *cur_region = - regions + - find_regions_index(regions, num_regions, j + offset); - double avg_coeff = cur_region->avg_cor_coeff; - if (rc->cor_coeff[j + offset] > avg_coeff && - cur_region->type != BLENDING_REGION) { - cur_last = j; - found = 1; + int met_blending = 0; // Whether we have met blending areas before + int 
last_blending = 0; // Whether the previous frame if blending + for (int j = cur_start + min_shrink_int; j <= cur_last; j++) { + if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; + base_score = (base_score + 1.0) * stats[j].cor_coeff; + int this_reg = + find_regions_index(regions, num_regions, j + offset); + if (this_reg < 0) continue; + // A GOP should include at most 1 blending region. + if (regions[this_reg].type == BLENDING_REGION) { + last_blending = 1; + if (met_blending) { break; + } else { + base_score = 0; + continue; } + } else { + if (last_blending) met_blending = 1; + last_blending = 0; + } + + // Add the factor of how good the neighborhood is for this + // candidate arf. + double this_score = arf_length_factor * base_score; + double temp_accu_coeff = 1.0; + // following frames + int count_f = 0; + for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) { + if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break; + temp_accu_coeff *= stats[n].cor_coeff; + this_score += + temp_accu_coeff * + (1 - stats[n].noise_var / + AOMMAX(regions[this_reg].avg_intra_err, 0.001)); + count_f++; + } + // preceding frames + temp_accu_coeff = 1.0; + for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) { + if (stats + n < twopass->stats_buf_ctx->stats_in_start) break; + temp_accu_coeff *= stats[n].cor_coeff; + this_score += + temp_accu_coeff * + (1 - stats[n].noise_var / + AOMMAX(regions[this_reg].avg_intra_err, 0.001)); + } + + if (this_score > best_score) { + best_score = this_score; + best_j = j; } } - if (!found) { - // Could not find a better point, - // try not to cut in blending areas - for (int j = cur_last; j >= cur_start + min_shrink_int; j--) { - REGIONS *cur_region = - regions + - find_regions_index(regions, num_regions, j + offset); - if (cur_region->type != BLENDING_REGION) { - cur_last = j; - break; + + // For blending areas, move one more frame in case we missed the + // first blending frame. 
+ int best_reg = + find_regions_index(regions, num_regions, best_j + offset); + if (best_reg < num_regions - 1 && best_reg > 0) { + if (regions[best_reg - 1].type == BLENDING_REGION && + regions[best_reg + 1].type == BLENDING_REGION) { + if (best_j + offset == regions[best_reg].start && + best_j + offset < regions[best_reg].last) { + best_j += 1; + } else if (best_j + offset == regions[best_reg].last && + best_j + offset > regions[best_reg].start) { + best_j -= 1; } } } + + if (cur_last - best_j < 2) best_j = cur_last; + if (best_j > 0 && best_score > 0.1) cur_last = best_j; // if cannot find anything, just cut at the original place. } } @@ -2081,11 +2089,11 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // reset pointers to the shrinked location twopass->stats_in = start_pos + cur_last; cur_start = cur_last; - if (regions[find_regions_index(regions, num_regions, - cur_start + 1 + offset)] - .type == SCENECUT_REGION) { - cur_start++; - } + int cur_region_idx = + find_regions_index(regions, num_regions, cur_start + 1 + offset); + if (cur_region_idx >= 0) + if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++; + i = cur_last; if (cut_here > 1 && cur_last == ori_last) break; @@ -2099,9 +2107,9 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // save intervals rc->intervals_till_gf_calculate_due = count_cuts - 1; for (int n = 1; n < count_cuts; n++) { - rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1]; + p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1]; } - rc->cur_gf_index = 0; + p_rc->cur_gf_index = 0; twopass->stats_in = start_pos; } @@ -2110,12 +2118,13 @@ static void correct_frames_to_key(AV1_COMP *cpi) { (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); if (lookahead_size < av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) { - assert(IMPLIES(cpi->oxcf.pass != 0 && cpi->frames_left > 0, - lookahead_size == cpi->frames_left)); + assert(IMPLIES(cpi->oxcf.pass != 0 
&& cpi->ppi->frames_left > 0, + lookahead_size == cpi->ppi->frames_left)); cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size); - } else if (cpi->frames_left > 0) { + } else if (cpi->ppi->frames_left > 0) { // Correct frames to key based on limit - cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, cpi->frames_left); + cpi->rc.frames_to_key = + AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left); } } @@ -2129,11 +2138,12 @@ static void correct_frames_to_key(AV1_COMP *cpi) { * * \param[in] cpi Top-level encoder structure * - * \return Nothing is returned. Instead, cpi->gf_group is changed. + * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed. */ static void define_gf_group_pass0(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const GFConfig *const gf_cfg = &oxcf->gf_cfg; int target; @@ -2141,28 +2151,28 @@ static void define_gf_group_pass0(AV1_COMP *cpi) { if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { av1_cyclic_refresh_set_golden_update(cpi); } else { - rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index]; + p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index]; rc->intervals_till_gf_calculate_due--; - rc->cur_gf_index++; + p_rc->cur_gf_index++; } // correct frames_to_key when lookahead queue is flushing correct_frames_to_key(cpi); - if (rc->baseline_gf_interval > rc->frames_to_key) - rc->baseline_gf_interval = rc->frames_to_key; + if (p_rc->baseline_gf_interval > rc->frames_to_key) + p_rc->baseline_gf_interval = rc->frames_to_key; - rc->gfu_boost = DEFAULT_GF_BOOST; - rc->constrained_gf_group = - (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + p_rc->gfu_boost = DEFAULT_GF_BOOST; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 
1 : 0; gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height; // Rare case when the look-ahead is less than the target GOP length, can't // generate ARF frame. - if (rc->baseline_gf_interval > gf_cfg->lag_in_frames || + if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames || !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) || - rc->baseline_gf_interval < rc->min_gf_interval) + p_rc->baseline_gf_interval < rc->min_gf_interval) gf_group->max_layer_depth_allowed = 0; // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) @@ -2194,7 +2204,8 @@ static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position, int use_alt_ref, int is_final_pass) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; // Set the interval until the next gf. // If forward keyframes are enabled, ensure the final gf group obeys the // MIN_FWD_KF_INTERVAL. 
@@ -2203,27 +2214,28 @@ static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position, twopass->stats_buf_ctx->stats_in_end; if (cpi->oxcf.kf_cfg.fwd_kf_enabled && use_alt_ref && !is_last_kf && - cpi->rc.next_is_fwd_key) { + cpi->ppi->p_rc.next_is_fwd_key) { if (arf_position == rc->frames_to_key + 1) { - rc->baseline_gf_interval = arf_position; + p_rc->baseline_gf_interval = arf_position; // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL } else if (rc->frames_to_key + 1 - arf_position < AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) { // if possible, merge the last two gf groups if (rc->frames_to_key + 1 <= active_max_gf_interval) { - rc->baseline_gf_interval = rc->frames_to_key + 1; + p_rc->baseline_gf_interval = rc->frames_to_key + 1; if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; // if merging the last two gf groups creates a group that is too long, // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL } else { - rc->baseline_gf_interval = rc->frames_to_key + 1 - MIN_FWD_KF_INTERVAL; + p_rc->baseline_gf_interval = + rc->frames_to_key + 1 - MIN_FWD_KF_INTERVAL; if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; } } else { - rc->baseline_gf_interval = arf_position; + p_rc->baseline_gf_interval = arf_position; } } else { - rc->baseline_gf_interval = arf_position; + p_rc->baseline_gf_interval = arf_position; } } @@ -2269,18 +2281,19 @@ static void init_gf_stats(GF_GROUP_STATS *gf_stats) { * \param[in] is_final_pass Whether this is the final pass for the * GF group, or a trial (non-zero) * - * \return Nothing is returned. Instead, cpi->gf_group is changed. + * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed. 
*/ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, EncodeFrameParams *frame_params, int max_gop_length, int is_final_pass) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; FRAME_INFO *frame_info = &cpi->frame_info; const GFConfig *const gf_cfg = &oxcf->gf_cfg; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; @@ -2289,12 +2302,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, int64_t gf_group_bits; const int is_intra_only = rc->frames_since_key == 0; - cpi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); + cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. 
if (!is_intra_only) { - av1_zero(cpi->gf_group); + av1_zero(cpi->ppi->gf_group); + cpi->gf_frame_index = 0; } aom_clear_system_state(); @@ -2306,7 +2320,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, } // correct frames_to_key when lookahead queue is emptying - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { correct_frames_to_key(cpi); } @@ -2336,8 +2350,8 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, AOMMIN(rc->max_gf_interval, max_gop_length); i = is_intra_only; - // get the determined gf group length from rc->gf_intervals - while (i < rc->gf_intervals[rc->cur_gf_index]) { + // get the determined gf group length from p_rc->gf_intervals + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { // read in the next frame if (EOF == input_stats(twopass, &next_frame)) break; // Accumulate error score of frames in this gf group. @@ -2360,7 +2374,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, i = is_intra_only; input_stats(twopass, &next_frame); - while (i < rc->gf_intervals[rc->cur_gf_index]) { + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { // read in the next frame if (EOF == input_stats(twopass, &next_frame)) break; @@ -2369,13 +2383,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, flash_detected = detect_flash(twopass, 0); // accumulate stats for next frame - accumulate_next_frame_stats(&next_frame, frame_info, flash_detected, + accumulate_next_frame_stats(&next_frame, flash_detected, rc->frames_since_key, i, &gf_stats); ++i; } - i = rc->gf_intervals[rc->cur_gf_index]; + i = p_rc->gf_intervals[p_rc->cur_gf_index]; // save the errs for the last frame last_frame_stats.frame_coded_error = next_frame.coded_error; @@ -2384,11 +2398,11 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, if (is_final_pass) { rc->intervals_till_gf_calculate_due--; - rc->cur_gf_index++; + p_rc->cur_gf_index++; } // Was the group length 
constrained by the requirement for a new KF? - rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + p_rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs @@ -2407,32 +2421,34 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION && gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) { - cpi->internal_altref_allowed = 0; + cpi->ppi->internal_altref_allowed = 0; } int use_alt_ref; if (can_disable_arf) { use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator, - twopass->kf_zeromotion_pct, cpi->lap_enabled) && - rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && + twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) && + p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i >= MIN_GF_INTERVAL); + FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; // TODO(urvang): Improve and use model for VBR, CQ etc as well. - if (use_alt_ref && rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 200) { + if (use_alt_ref && use_ml_model_to_decide_flat_gop(rc_cfg) && + !is_fp_stats_to_predict_flat_gop_invalid(total_stats)) { aom_clear_system_state(); float features[21]; get_features_from_gf_stats( &gf_stats, &first_frame_stats, &last_frame_stats, num_mbs, - rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features); + p_rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features); // Infer using ML model. 
float score; av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score); use_alt_ref = (score <= 0.0); } } else { - use_alt_ref = - rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i > 2); + use_alt_ref = p_rc->use_arf_in_this_kf_group && + (i < gf_cfg->lag_in_frames) && (i > 2); } #define REDUCE_GF_LENGTH_THRESH 4 @@ -2443,7 +2459,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, // work well for certain other cases. const int allow_gf_length_reduction = ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) || - !cpi->internal_altref_allowed) && + !cpi->ppi->internal_altref_allowed) && !is_lossless_requested(rc_cfg); if (allow_gf_length_reduction && use_alt_ref) { @@ -2485,48 +2501,48 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, : AOMMAX(0, rc->frames_to_key - i); // Calculate the boost for alt ref. - rc->gfu_boost = av1_calc_arf_boost( - twopass, rc, frame_info, alt_offset, forward_frames, ext_len, - cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL, - cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL); + p_rc->gfu_boost = av1_calc_arf_boost( + twopass, p_rc, rc, frame_info, alt_offset, forward_frames, ext_len, + &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled); } else { reset_fpf_position(twopass, start_pos); gf_group->max_layer_depth_allowed = 0; set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref, is_final_pass); - rc->gfu_boost = AOMMIN( + p_rc->gfu_boost = AOMMIN( MAX_GF_BOOST, - av1_calc_arf_boost( - twopass, rc, frame_info, alt_offset, ext_len, 0, - cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL, - cpi->lap_enabled ? 
&rc->num_stats_required_for_gfu_boost : NULL)); + av1_calc_arf_boost(twopass, p_rc, rc, frame_info, alt_offset, ext_len, + 0, &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, + cpi->ppi->lap_enabled)); } #define LAST_ALR_BOOST_FACTOR 0.2f - rc->arf_boost_factor = 1.0; + p_rc->arf_boost_factor = 1.0; if (use_alt_ref && !is_lossless_requested(rc_cfg)) { // Reduce the boost of altref in the last gf group if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY || rc->frames_to_key - ext_len == 0) { - rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; + p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; } } - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; // Reset the file position. reset_fpf_position(twopass, start_pos); - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { // Since we don't have enough stats to know the actual error of the // gf group, we assume error of each frame to be equal to 1 and set // the error of the group as baseline_gf_interval. - gf_stats.gf_group_err = rc->baseline_gf_interval; + gf_stats.gf_group_err = p_rc->baseline_gf_interval; } // Calculate the bits to be allocated to the gf/arf group as a whole gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err); - rc->gf_group_bits = gf_group_bits; + p_rc->gf_group_bits = gf_group_bits; #if GROUP_ADAPTIVE_MAXQ // Calculate an estimate of the maxq needed for the group. @@ -2534,17 +2550,17 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget. 
- if ((rc_cfg->mode != AOM_Q) && (rc->baseline_gf_interval > 1) && + if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) && is_final_pass) { const int vbr_group_bits_per_frame = - (int)(gf_group_bits / rc->baseline_gf_interval); + (int)(gf_group_bits / p_rc->baseline_gf_interval); const double group_av_err = - gf_stats.gf_group_raw_error / rc->baseline_gf_interval; + gf_stats.gf_group_raw_error / p_rc->baseline_gf_interval; const double group_av_skip_pct = - gf_stats.gf_group_skip_pct / rc->baseline_gf_interval; + gf_stats.gf_group_skip_pct / p_rc->baseline_gf_interval; const double group_av_inactive_zone = ((gf_stats.gf_group_inactive_zone_rows * 2) / - (rc->baseline_gf_interval * (double)cm->mi_params.mb_rows)); + (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows)); int tmp_q; tmp_q = get_twopass_worst_quality( @@ -2568,7 +2584,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, if (rc->frames_since_key != 0) { twopass->section_intra_rating = calculate_section_intra_ratio( start_pos, twopass->stats_buf_ctx->stats_in_end, - rc->baseline_gf_interval); + p_rc->baseline_gf_interval); } av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0, @@ -2577,12 +2593,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, frame_params->frame_type = rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME; frame_params->show_frame = - !(gf_group->update_type[gf_group->index] == ARF_UPDATE || - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE); + !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE); // TODO(jingning): Generalize this condition. if (is_final_pass) { - cpi->gf_state.arf_gf_boost_lst = use_alt_ref; + cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref; // Reset rolling actual and target bits counters for ARF groups. 
twopass->rolling_arf_group_target_bits = 1; @@ -2597,12 +2613,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, GF_GROUP *gf_group, int is_key_frame, int use_arf, int64_t gf_group_bits) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Calculate the extra bits to be used for boosted frame(s) #ifdef FIXED_ARF_BITS int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits); #else int gf_arf_bits = calculate_boost_bits( - rc->baseline_gf_interval - (rc->frames_since_key == 0), rc->gfu_boost, + p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost, gf_group_bits); #endif @@ -2610,8 +2627,8 @@ void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, gf_group_bits, 1); // Allocate bits to each of the frames in the GF group. - allocate_gf_group_bits(gf_group, rc, gf_group_bits, gf_arf_bits, is_key_frame, - use_arf); + allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits, + is_key_frame, use_arf); } // Minimum % intra coding observed in first pass (1.0 = 100%) @@ -2786,10 +2803,10 @@ static int test_candidate_kf(TWO_PASS *twopass, #define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval static int detect_app_forced_key(AV1_COMP *cpi) { - if (cpi->oxcf.kf_cfg.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1; + if (cpi->oxcf.kf_cfg.fwd_kf_enabled) cpi->ppi->p_rc.next_is_fwd_key = 1; int num_frames_to_app_forced_key = is_forced_keyframe_pending( cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage); - if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0; + if (num_frames_to_app_forced_key != -1) cpi->ppi->p_rc.next_is_fwd_key = 0; return num_frames_to_app_forced_key; } @@ -2799,16 +2816,16 @@ static int get_projected_kf_boost(AV1_COMP *cpi) { * all stats needed for prior boost calculation are available. * Hence projecting the prior boost is not needed in this cases. 
*/ - if (cpi->rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) - return cpi->rc.kf_boost; + if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) + return cpi->ppi->p_rc.kf_boost; // Get the current tpl factor (number of frames = frames_to_key). double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key); // Get the tpl factor when number of frames = num_stats_used_for_kf_boost. - double tpl_factor_num_stats = - av1_get_kf_boost_projection_factor(cpi->rc.num_stats_used_for_kf_boost); + double tpl_factor_num_stats = av1_get_kf_boost_projection_factor( + cpi->ppi->p_rc.num_stats_used_for_kf_boost); int projected_kf_boost = - (int)rint((tpl_factor * cpi->rc.kf_boost) / tpl_factor_num_stats); + (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats); return projected_kf_boost; } @@ -2828,8 +2845,9 @@ static int get_projected_kf_boost(AV1_COMP *cpi) { static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, double *kf_group_err, int num_frames_to_detect_scenecut) { - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; @@ -2874,7 +2892,7 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, input_stats(twopass, this_frame); // Provided that we are not at the end of the file... 
- if ((cpi->rc.enable_scenecut_detection > 0) && kf_cfg->auto_key && + if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key && twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) { double loop_decay_rate; @@ -2882,14 +2900,13 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, if (frames_since_key >= kf_cfg->key_freq_min && test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in, frames_since_key, oxcf->rc_cfg.mode, - cpi->rc.enable_scenecut_detection)) { + cpi->ppi->p_rc.enable_scenecut_detection)) { scenecut_detected = 1; break; } // How fast is the prediction quality decaying? - loop_decay_rate = - get_prediction_decay_rate(frame_info, twopass->stats_in); + loop_decay_rate = get_prediction_decay_rate(twopass->stats_in); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction @@ -2909,7 +2926,7 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, // In the case of transition followed by a static scene, the key frame // could be a good predictor for the following frames, therefore we // do not use an arf. 
- rc->use_arf_in_this_kf_group = 0; + p_rc->use_arf_in_this_kf_group = 0; break; } @@ -2928,14 +2945,14 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, } if (kf_group_err != NULL) - rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost; + p_rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost; - if (cpi->lap_enabled && !scenecut_detected) + if (cpi->ppi->lap_enabled && !scenecut_detected) frames_to_key = num_frames_to_next_key; if (!kf_cfg->fwd_kf_enabled || scenecut_detected || twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) - rc->next_is_fwd_key = 0; + p_rc->next_is_fwd_key = 0; return frames_to_key; } @@ -2964,9 +2981,9 @@ static double get_kf_group_avg_error(TWO_PASS *twopass, static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err, double kf_group_avg_error) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; int64_t kf_group_bits; - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth; if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) { const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) @@ -2990,7 +3007,7 @@ static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err, static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS cur_frame; av1_zero(cur_frame); int num_frames = 0; @@ -3039,7 +3056,7 @@ static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, double *zero_motion_accumulator, double *sr_accumulator, int use_avg_stat) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; FIRSTPASS_STATS frame_stat; av1_zero(frame_stat); @@ 
-3061,8 +3078,7 @@ static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, // For the first frame in kf group, the second ref indicator is invalid. if (i > 0) { *zero_motion_accumulator = - AOMMIN(*zero_motion_accumulator, - get_zero_motion_factor(frame_info, &frame_stat)); + AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat)); } else { *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion; } @@ -3102,8 +3118,9 @@ static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, */ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; FRAME_INFO *const frame_info = &cpi->frame_info; AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; @@ -3115,27 +3132,26 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->frames_since_key = 0; // Use arfs if possible. - rc->use_arf_in_this_kf_group = is_altref_enabled( + p_rc->use_arf_in_this_kf_group = is_altref_enabled( oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf); // Reset the GF group data structures. av1_zero(*gf_group); + cpi->gf_frame_index = 0; // KF is always a GF so clear frames till next gf counter. 
rc->frames_till_gf_update_due = 0; - rc->frames_to_key = 1; - if (has_no_stats_stage(cpi)) { int num_frames_to_app_forced_key = detect_app_forced_key(cpi); - rc->this_key_frame_forced = + p_rc->this_key_frame_forced = current_frame->frame_number != 0 && rc->frames_to_key == 0; if (num_frames_to_app_forced_key != -1) rc->frames_to_key = num_frames_to_app_forced_key; else rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max); correct_frames_to_key(cpi); - rc->kf_boost = DEFAULT_KF_BOOST; + p_rc->kf_boost = DEFAULT_KF_BOOST; gf_group->update_type[0] = KF_UPDATE; return; } @@ -3153,7 +3169,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { int64_t kf_group_bits_clipped = INT64_MAX; // Is this a forced key frame by interval. - rc->this_key_frame_forced = rc->next_key_frame_forced; + p_rc->this_key_frame_forced = p_rc->next_key_frame_forced; twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0; // Group modified error score. @@ -3169,7 +3185,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { else rc->frames_to_key = kf_cfg->key_freq_max; - if (cpi->lap_enabled) correct_frames_to_key(cpi); + if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi); // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. 
@@ -3191,28 +3207,29 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { calculate_modified_err(frame_info, twopass, oxcf, &tmp_frame); if (EOF == input_stats(twopass, &tmp_frame)) break; } - rc->next_key_frame_forced = 1; + p_rc->next_key_frame_forced = 1; } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end && is_stat_consumption_stage_twopass(cpi)) || rc->frames_to_key >= kf_cfg->key_freq_max) { - rc->next_key_frame_forced = 1; + p_rc->next_key_frame_forced = 1; } else { - rc->next_key_frame_forced = 0; + p_rc->next_key_frame_forced = 0; } - if (kf_cfg->fwd_kf_enabled) rc->next_is_fwd_key |= rc->next_key_frame_forced; + if (kf_cfg->fwd_kf_enabled) + p_rc->next_is_fwd_key |= p_rc->next_key_frame_forced; // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) { // Accumulate kf group error. kf_group_err += calculate_modified_err(frame_info, twopass, oxcf, this_frame); - rc->next_is_fwd_key = 0; + p_rc->next_is_fwd_key = 0; } // Calculate the number of bits that should be assigned to the kf group. if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) || - (cpi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) { + (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) { // Maximum number of bits for a single normal frame (not key frame). const int max_bits = frame_max_bits(rc, oxcf); @@ -3237,7 +3254,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { // In the case of single pass based on LAP, frames to key may have an // inaccurate value, and hence should be clipped to an appropriate // interval. 
@@ -3268,17 +3285,17 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key); - rc->kf_boost = (int)boost_score; + p_rc->kf_boost = (int)boost_score; - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { if (oxcf->rc_cfg.mode == AOM_Q) { - rc->kf_boost = get_projected_kf_boost(cpi); + p_rc->kf_boost = get_projected_kf_boost(cpi); } else { // TODO(any): Explore using average frame stats for AOM_Q as well. boost_score = get_kf_boost_score( cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1); reset_fpf_position(twopass, start_position); - rc->kf_boost += (int)boost_score; + p_rc->kf_boost += (int)boost_score; } } @@ -3286,13 +3303,13 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // if the kf group is very short. if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) && (rc->frames_to_key > 8)) { - rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST); } else { // Apply various clamps for min and max boost - rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3)); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST); #ifdef STRICT_RC - rc->kf_boost = AOMMIN(rc->kf_boost, MAX_KF_BOOST); + p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST); #endif } @@ -3301,9 +3318,10 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // very high, we calculate the bits based on a clipped value of // frames_to_key. 
kf_bits = calculate_boost_bits( - AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, rc->kf_boost, + AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost, AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped)); - // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost, + // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", + // p_rc->kf_boost, // kf_bits, twopass->kf_zeromotion_pct); kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits, twopass->kf_group_bits, 0); @@ -3315,7 +3333,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group->update_type[0] = KF_UPDATE; // Note the total error score of the kf group minus the key frame itself. - if (cpi->lap_enabled) + if (cpi->ppi->lap_enabled) // As we don't have enough stats to know the actual error of the group, // we assume the complexity of each frame to be equal to 1, and set the // error as the number of frames in the group(minus the keyframe). 
@@ -3335,7 +3353,7 @@ static int is_skippable_frame(const AV1_COMP *cpi) { // first pass, and so do its previous and forward frames, then this frame // can be skipped for partition check, and the partition size is assigned // according to the variance - const TWO_PASS *const twopass = &cpi->twopass; + const TWO_PASS *const twopass = &cpi->ppi->twopass; return (!frame_is_intra_only(&cpi->common) && twopass->stats_in - 2 > twopass->stats_buf_ctx->stats_in_start && @@ -3358,34 +3376,78 @@ static int get_section_target_bandwidth(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; int section_target_bandwidth; const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count - current_frame->frame_number); - if (cpi->lap_enabled) + if (cpi->ppi->lap_enabled) section_target_bandwidth = (int)rc->avg_frame_bandwidth; else section_target_bandwidth = (int)(twopass->bits_left / frames_left); return section_target_bandwidth; } +static INLINE void set_twopass_params_based_on_fp_stats( + const AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) { + if (this_frame_ptr == NULL) return; + + TWO_PASS *const twopass = &cpi->ppi->twopass; + const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.mi_params.MBs; + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame_ptr->intra_error / num_mbs) + 1.0); + + const FIRSTPASS_STATS *const total_stats = + twopass->stats_buf_ctx->total_stats; + if (is_fp_wavelet_energy_invalid(total_stats) == 0) { + twopass->frame_avg_haar_energy = + log((this_frame_ptr->frame_avg_wavelet_energy / num_mbs) + 1.0); + } + + // Set the frame content type flag. 
+ if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH) + twopass->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass->fr_content_type = FC_NORMAL; +} + static void process_first_pass_stats(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; + + if (current_frame->frame_number == 0) { + const GFConfig *const gf_cfg = &cpi->oxcf.gf_cfg; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + if (use_ml_model_to_decide_flat_gop(rc_cfg) && can_disable_altref(gf_cfg) && + is_fp_stats_to_predict_flat_gop_invalid(total_stats)) { + // warn( + // "First pass stats required in the ML model to predict a flat GOP " + // "structure is invalid. Continuing encoding by disabling the ML " + // "model.\n"); + // The first pass statistics like tr_coded_error, pcnt_third_ref, + // frame_avg_wavelet_energy are invalid as their calculations were + // skipped in the first pass of encoding. As these stats are required + // in the ML model to predict a flat GOP structure, the ML model would be + // disabled. This case arises when the encode configuration used in first + // pass encoding is different from second pass encoding. + } + } if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 && - cpi->gf_group.index == 0 && cpi->twopass.stats_buf_ctx->total_stats && - cpi->twopass.stats_buf_ctx->total_left_stats) { - if (cpi->lap_enabled) { + cpi->gf_frame_index == 0 && total_stats && + cpi->ppi->twopass.stats_buf_ctx->total_left_stats) { + if (cpi->ppi->lap_enabled) { /* * Accumulate total_stats using available limited number of stats, * and assign it to total_left_stats. 
*/ - *cpi->twopass.stats_buf_ctx->total_left_stats = - *cpi->twopass.stats_buf_ctx->total_stats; + *cpi->ppi->twopass.stats_buf_ctx->total_left_stats = *total_stats; } // Special case code for first frame. const int section_target_bandwidth = get_section_target_bandwidth(cpi); @@ -3406,43 +3468,25 @@ static void process_first_pass_stats(AV1_COMP *cpi, rc->active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; rc->last_q[INTER_FRAME] = tmp_q; - rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth); + rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth); rc->avg_frame_qindex[INTER_FRAME] = tmp_q; rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2; rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME]; } - int err = 0; - if (cpi->lap_enabled) { - err = input_stats_lap(twopass, this_frame); + if (cpi->ppi->lap_enabled) { + input_stats_lap(twopass, this_frame); } else { - err = input_stats(twopass, this_frame); - } - if (err == EOF) return; - - { - const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cm->mi_params.MBs; - // The multiplication by 256 reverses a scaling factor of (>> 8) - // applied when combining MB error values for the frame. - twopass->mb_av_energy = log((this_frame->intra_error / num_mbs) + 1.0); - twopass->frame_avg_haar_energy = - log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0); + input_stats(twopass, this_frame); } - - // Set the frame content type flag. 
- if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH) - twopass->fr_content_type = FC_GRAPHICS_ANIMATION; - else - twopass->fr_content_type = FC_NORMAL; + set_twopass_params_based_on_fp_stats(cpi, this_frame); } static void setup_target_rate(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; - int target_rate = gf_group->bit_allocation[gf_group->index]; + int target_rate = gf_group->bit_allocation[cpi->gf_frame_index]; if (has_no_stats_stage(cpi)) { av1_rc_set_frame_target(cpi, target_rate, cpi->common.width, @@ -3452,24 +3496,160 @@ static void setup_target_rate(AV1_COMP *cpi) { rc->base_frame_target = target_rate; } +static void mark_flashes(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats = first_stats, *next_stats; + while (this_stats < last_stats - 1) { + next_stats = this_stats + 1; + if (next_stats->pcnt_second_ref > next_stats->pcnt_inter && + next_stats->pcnt_second_ref >= 0.5) { + this_stats->is_flash = 1; + } else { + this_stats->is_flash = 0; + } + this_stats = next_stats; + } + // We always treat the last one as none flash. + if (last_stats - 1 >= first_stats) { + (last_stats - 1)->is_flash = 0; + } +} + +// Estimate the noise variance of each frame from the first pass stats +static void estimate_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats, *next_stats; + double C1, C2, C3, noise; + int count = 0; + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + this_stats->noise_var = 0.0; + // flashes tend to have high correlation of innovations, so ignore them. 
+ if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + + C1 = (this_stats - 1)->intra_error * + (this_stats->intra_error - this_stats->coded_error); + C2 = (this_stats - 2)->intra_error * + ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error); + C3 = (this_stats - 2)->intra_error * + (this_stats->intra_error - this_stats->sr_coded_error); + if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue; + C1 = sqrt(C1); + C2 = sqrt(C2); + C3 = sqrt(C3); + + noise = (this_stats - 1)->intra_error - C1 * C2 / C3; + noise = AOMMAX(noise, 0.01); + this_stats->noise_var = noise; + count++; + } + + // Copy noise from the neighbor if the noise value is not trustworthy + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + if (this_stats->noise_var < 1.0) { + int found = 0; + // TODO(bohanli): consider expanding to two directions at the same time + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + this_stats->noise_var = next_stats->noise_var; + break; + } + } + } + + // copy the noise if this is a flash + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) { + int found = 0; + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 
2)->is_flash) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash) + continue; + this_stats->noise_var = next_stats->noise_var; + break; + } + } + } + + // if we are at the first 2 frames, copy the noise + for (this_stats = first_stats; + this_stats < first_stats + 2 && (first_stats + 2) < last_stats; + this_stats++) { + this_stats->noise_var = (first_stats + 2)->noise_var; + } +} + +// Estimate correlation coefficient of each frame with its previous frame. +static void estimate_coeff(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats; + for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) { + const double C = + sqrt(AOMMAX((this_stats - 1)->intra_error * + (this_stats->intra_error - this_stats->coded_error), + 0.001)); + const double cor_coeff = + C / + AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001); + + this_stats->cor_coeff = + cor_coeff * + sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, + 0.001) / + AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001)); + // clip correlation coefficient. 
+ this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1); + } + first_stats->cor_coeff = 1.0; +} + void av1_get_second_pass_params(AV1_COMP *cpi, EncodeFrameParams *const frame_params, const EncodeFrameInput *const frame_input, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return; - const int update_type = gf_group->update_type[gf_group->index]; - frame_params->frame_type = gf_group->frame_type[gf_group->index]; + assert(twopass->stats_in != NULL); + const int update_type = gf_group->update_type[cpi->gf_frame_index]; + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; - if (gf_group->index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) { - assert(gf_group->index < gf_group->size); + if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) { + assert(cpi->gf_frame_index < gf_group->size); setup_target_rate(cpi); @@ -3481,6 +3661,9 @@ void av1_get_second_pass_params(AV1_COMP *cpi, if (cpi->sf.part_sf.allow_partition_search_skip && oxcf->pass == 2) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } + const FIRSTPASS_STATS *const this_frame_ptr = read_frame_stats( + twopass, gf_group->arf_src_offset[cpi->gf_frame_index]); + set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); return; } } @@ -3493,7 +3676,7 @@ void av1_get_second_pass_params(AV1_COMP *cpi, av1_zero(this_frame); // call above fn if (is_stat_consumption_stage(cpi)) { - if (gf_group->index < gf_group->size || rc->frames_to_key == 0) + if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) 
process_first_pass_stats(cpi, &this_frame); } else { rc->active_worst_quality = oxcf->rc_cfg.cq_level; @@ -3504,7 +3687,7 @@ void av1_get_second_pass_params(AV1_COMP *cpi, this_frame_copy = this_frame; int is_overlay_forward_kf = rc->frames_to_key == 0 && - gf_group->update_type[gf_group->index] == OVERLAY_UPDATE; + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE; if (rc->frames_to_key <= 0 && !is_overlay_forward_kf) { assert(rc->frames_to_key >= -1); // Define next KF group and assign bits to it. @@ -3554,12 +3737,12 @@ void av1_get_second_pass_params(AV1_COMP *cpi, } // Define a new GF/ARF group. (Should always enter here for key frames). - if (gf_group->index == gf_group->size) { + if (cpi->gf_frame_index == gf_group->size) { assert(cpi->common.current_frame.frame_number == 0 || - gf_group->index == gf_group->size); + cpi->gf_frame_index == gf_group->size); const FIRSTPASS_STATS *const start_position = twopass->stats_in; - if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection) { + if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) { int num_frames_to_detect_scenecut, frames_to_key; num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1; frames_to_key = define_kf_interval(cpi, &this_frame, NULL, @@ -3578,41 +3761,45 @@ void av1_get_second_pass_params(AV1_COMP *cpi, : MAX_GF_LENGTH_LAP; // Identify regions if needed. + // TODO(bohanli): identify regions for all stats available. 
if (rc->frames_since_key == 0 || rc->frames_since_key == 1 || - (rc->frames_till_regions_update - rc->frames_since_key < + (p_rc->frames_till_regions_update - rc->frames_since_key < rc->frames_to_key && - rc->frames_till_regions_update - rc->frames_since_key < + p_rc->frames_till_regions_update - rc->frames_since_key < max_gop_length + 1)) { - int is_first_stat = - twopass->stats_in == twopass->stats_buf_ctx->stats_in_start; - const FIRSTPASS_STATS *stats_start = twopass->stats_in + is_first_stat; - // offset of stats_start from the current frame - int offset = is_first_stat || (rc->frames_since_key == 0); - // offset of the region indices from the previous key frame - rc->regions_offset = rc->frames_since_key; // how many frames we can analyze from this frame - int rest_frames = AOMMIN(rc->frames_to_key + rc->next_is_fwd_key, + int rest_frames = AOMMIN(rc->frames_to_key + p_rc->next_is_fwd_key, MAX_FIRSTPASS_ANALYSIS_FRAMES); - rest_frames = - AOMMIN(rest_frames, - (int)(twopass->stats_buf_ctx->stats_in_end - stats_start + 1) + - offset); - - rc->frames_till_regions_update = rest_frames; - - identify_regions(stats_start, rest_frames - offset, offset, rc->regions, - &rc->num_regions, rc->cor_coeff); + rest_frames = AOMMIN( + rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end - + twopass->stats_in + (rc->frames_since_key == 0))); + p_rc->frames_till_regions_update = rest_frames; + + if (cpi->ppi->lap_enabled) { + mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + identify_regions(twopass->stats_in, rest_frames, + (rc->frames_since_key == 0), p_rc->regions, + &p_rc->num_regions); + } else { + identify_regions(twopass->stats_in - (rc->frames_since_key == 0), + rest_frames, 0, p_rc->regions, &p_rc->num_regions); + } } int 
cur_region_idx = - find_regions_index(rc->regions, rc->num_regions, - rc->frames_since_key - rc->regions_offset); + find_regions_index(p_rc->regions, p_rc->num_regions, + rc->frames_since_key - p_rc->regions_offset); if ((cur_region_idx >= 0 && - rc->regions[cur_region_idx].type == SCENECUT_REGION) || + p_rc->regions[cur_region_idx].type == SCENECUT_REGION) || rc->frames_since_key == 0) { // If we start from a scenecut, then the last GOP's arf boost is not // needed for this GOP. - cpi->gf_state.arf_gf_boost_lst = 0; + cpi->ppi->gf_state.arf_gf_boost_lst = 0; } // TODO(jingning): Resoleve the redundant calls here. @@ -3621,62 +3808,49 @@ void av1_get_second_pass_params(AV1_COMP *cpi, } if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model && - !cpi->sf.tpl_sf.disable_gop_length_decision) { - int this_idx = rc->frames_since_key + rc->gf_intervals[rc->cur_gf_index] - - rc->regions_offset - 1; + cpi->sf.tpl_sf.gop_length_decision_method != 3) { + int this_idx = rc->frames_since_key + + p_rc->gf_intervals[p_rc->cur_gf_index] - + p_rc->regions_offset - 1; int this_region = - find_regions_index(rc->regions, rc->num_regions, this_idx); + find_regions_index(p_rc->regions, p_rc->num_regions, this_idx); int next_region = - find_regions_index(rc->regions, rc->num_regions, this_idx + 1); + find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1); int is_last_scenecut = - (rc->gf_intervals[rc->cur_gf_index] >= rc->frames_to_key || - rc->regions[this_region].type == SCENECUT_REGION || - rc->regions[next_region].type == SCENECUT_REGION); - int ori_gf_int = rc->gf_intervals[rc->cur_gf_index]; + (p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key || + p_rc->regions[this_region].type == SCENECUT_REGION || + p_rc->regions[next_region].type == SCENECUT_REGION); + int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index]; - if (rc->gf_intervals[rc->cur_gf_index] > 16) { + if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 && + rc->min_gf_interval <= 16) { // The 
calculate_gf_length function is previously used with // max_gop_length = 32 with look-ahead gf intervals. define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0); this_frame = this_frame_copy; - int is_temporal_filter_enabled = - (rc->frames_since_key > 0 && gf_group->arf_index > -1); - if (is_temporal_filter_enabled) { - int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index]; - FRAME_UPDATE_TYPE arf_update_type = - gf_group->update_type[gf_group->arf_index]; - int is_forward_keyframe = 0; - av1_temporal_filter(cpi, arf_src_index, arf_update_type, - is_forward_keyframe, NULL); - aom_extend_frame_borders(&cpi->alt_ref_buffer, - av1_num_planes(&cpi->common)); - } - if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) { - // Tpl decides that a shorter gf interval is better. + + if (is_shorter_gf_interval_better(cpi, frame_params, frame_input)) { + // A shorter gf interval is better. // TODO(jingning): Remove redundant computations here. max_gop_length = 16; calculate_gf_length(cpi, max_gop_length, 1); if (is_last_scenecut && - (ori_gf_int - rc->gf_intervals[rc->cur_gf_index] < 4)) { - rc->gf_intervals[rc->cur_gf_index] = ori_gf_int; + (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) { + p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int; } - } else { - // Tpl stats is reused only when the ARF frame is temporally filtered - if (is_temporal_filter_enabled) - cpi->tpl_data.skip_tpl_setup_stats = 1; } } } define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0); - if (gf_group->update_type[gf_group->index] != ARF_UPDATE && + if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE && rc->frames_since_key > 0) process_first_pass_stats(cpi, &this_frame); define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - assert(gf_group->index == 0); + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + assert(cpi->gf_frame_index == 0); #if 
ARF_STATS_OUTPUT { FILE *fpfile; @@ -3684,18 +3858,22 @@ void av1_get_second_pass_params(AV1_COMP *cpi, ++arf_count; fprintf(fpfile, "%10d %10d %10d %10d %10d\n", cpi->common.current_frame.frame_number, - rc->frames_till_gf_update_due, rc->kf_boost, arf_count, - rc->gfu_boost); + rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count, + p_rc->gfu_boost); fclose(fpfile); } #endif } - assert(gf_group->index < gf_group->size); + assert(cpi->gf_frame_index < gf_group->size); - if (gf_group->update_type[gf_group->index] == ARF_UPDATE || - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { reset_fpf_position(twopass, start_pos); + + const FIRSTPASS_STATS *const this_frame_ptr = read_frame_stats( + twopass, gf_group->arf_src_offset[cpi->gf_frame_index]); + set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); } else { // Update the total stats remaining structure. if (twopass->stats_buf_ctx->total_left_stats) @@ -3703,7 +3881,7 @@ void av1_get_second_pass_params(AV1_COMP *cpi, &this_frame_copy); } - frame_params->frame_type = gf_group->frame_type[gf_group->index]; + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; // Do the firstpass stats indicate that this frame is skippable for the // partition search? 
@@ -3716,13 +3894,20 @@ void av1_get_second_pass_params(AV1_COMP *cpi, void av1_init_second_pass(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; double frame_rate; FIRSTPASS_STATS *stats; if (!twopass->stats_buf_ctx->stats_in_end) return; + mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + stats = twopass->stats_buf_ctx->total_stats; *stats = *twopass->stats_buf_ctx->stats_in_end; @@ -3779,7 +3964,7 @@ void av1_init_second_pass(AV1_COMP *cpi) { } void av1_init_single_pass_lap(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; if (!twopass->stats_buf_ctx->stats_in_end) return; @@ -3813,7 +3998,7 @@ void av1_init_single_pass_lap(AV1_COMP *cpi) { #define MINQ_ADJ_LIMIT_CQ 20 #define HIGH_UNDERSHOOT_RATIO 2 void av1_twopass_postencode_update(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; RATE_CONTROL *const rc = &cpi->rc; const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; @@ -3840,7 +4025,8 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { // Update the active best quality pyramid. 
if (!rc->is_src_frame_alt_ref) { - const int pyramid_level = cpi->gf_group.layer_depth[cpi->gf_group.index]; + const int pyramid_level = + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; int i; for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) { rc->active_best_quality[i] = cpi->common.quant_params.base_qindex; @@ -3871,9 +4057,9 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { (double)twopass->rolling_arf_group_target_bits, twopass->bpm_factor, av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex, - cm->seq_params.bit_depth), + cm->seq_params->bit_depth), av1_convert_qindex_to_q(rc->active_worst_quality, - cm->seq_params.bit_depth)); + cm->seq_params->bit_depth)); fclose(fpfile); } #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/pickcdef.c b/third_party/libaom/source/libaom/av1/encoder/pickcdef.c index 55e466d601..f9758343dc 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pickcdef.c +++ b/third_party/libaom/source/libaom/av1/encoder/pickcdef.c @@ -454,13 +454,13 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; cdef_search_ctx->nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); + cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6); cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method]; cdef_search_ctx->num_planes = num_planes; cdef_search_ctx->pick_method = pick_method; cdef_search_ctx->sb_count = 0; - av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, num_planes); // Initialize plane wise information. 
for (int pli = 0; pli < num_planes; pli++) { @@ -478,7 +478,7 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, } // Function pointer initialization. #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { cdef_search_ctx->copy_fn = copy_sb16_16_highbd; cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd; } else { @@ -491,13 +491,20 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, #endif } -static void pick_cdef_from_qp(AV1_COMMON *const cm) { - const int bd = cm->seq_params.bit_depth; +static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, + int frames_since_key) { + const int bd = cm->seq_params->bit_depth; const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8); CdefInfo *const cdef_info = &cm->cdef_info; - cdef_info->cdef_bits = 0; - cdef_info->nb_cdef_strengths = 1; + // Check the speed feature to avoid extra signaling. + if (skip_cdef) { + cdef_info->cdef_bits = 1; + cdef_info->nb_cdef_strengths = 2; + } else { + cdef_info->cdef_bits = 0; + cdef_info->nb_cdef_strengths = 1; + } cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6); int predicted_y_f1 = 0; @@ -537,13 +544,22 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm) { cdef_info->cdef_uv_strengths[0] = predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2; + if (skip_cdef) { + cdef_info->cdef_strengths[1] = 0; + cdef_info->cdef_uv_strengths[1] = 0; + } const CommonModeInfoParams *const mi_params = &cm->mi_params; const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; MB_MODE_INFO **mbmi = mi_params->mi_grid_base; for (int r = 0; r < nvfb; ++r) { for (int c = 0; c < nhfb; ++c) { - mbmi[MI_SIZE_64X64 * c]->cdef_strength = 0; + MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c]; + current_mbmi->cdef_strength = 0; + if (skip_cdef && 
current_mbmi->skip_cdef_curr_sb && + frames_since_key > 10) { + current_mbmi->cdef_strength = 1; + } } mbmi += MI_SIZE_64X64 * mi_params->mi_stride; } @@ -551,10 +567,10 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm) { void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm, - MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, - int rdmult) { + MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult, + int skip_cdef_feature, int frames_since_key) { if (pick_method == CDEF_PICK_FROM_Q) { - pick_cdef_from_qp(cm); + pick_cdef_from_qp(cm, skip_cdef_feature, frames_since_key); return; } const CommonModeInfoParams *const mi_params = &cm->mi_params; diff --git a/third_party/libaom/source/libaom/av1/encoder/pickcdef.h b/third_party/libaom/source/libaom/av1/encoder/pickcdef.h index 7fe1edb695..6bea1b0945 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pickcdef.h +++ b/third_party/libaom/source/libaom/av1/encoder/pickcdef.h @@ -58,20 +58,6 @@ typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src, BLOCK_SIZE bsize, int coeff_shift, int row, int col); -// Data related to CDEF search multi-thread synchronization. -typedef struct AV1CdefSyncData { -#if CONFIG_MULTITHREAD - // Mutex lock used while dispatching jobs. - pthread_mutex_t *mutex_; -#endif // CONFIG_MULTITHREAD - // Flag to indicate all blocks are processed and end of frame is reached - int end_of_frame; - // Row index in units of 64x64 block - int fbr; - // Column index in units of 64x64 block - int fbc; -} AV1CdefSync; - /*! \brief CDEF search context. 
*/ typedef struct { @@ -224,6 +210,8 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc, * \param[in] xd Pointer to common current coding block structure * \param[in] pick_method The method used to select params * \param[in] rdmult rd multiplier to use in making param choices + * \param[in] skip_cdef_feature Speed feature to skip cdef + * \param[in] frames_since_key Number of frames since key frame * * \return Nothing is returned. Instead, optimal CDEF parameters are stored * in the \c cdef_info structure of type \ref CdefInfo inside \c cm: @@ -239,7 +227,8 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc, void av1_cdef_search(struct MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm, - MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult); + MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult, + int skip_cdef_feature, int frames_since_key); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libaom/source/libaom/av1/encoder/picklpf.c b/third_party/libaom/source/libaom/av1/encoder/picklpf.c index 9b3924f5ce..44030767b5 100644 --- a/third_party/libaom/source/libaom/av1/encoder/picklpf.c +++ b/third_party/libaom/source/libaom/av1/encoder/picklpf.c @@ -39,8 +39,8 @@ static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, int av1_get_max_filter_level(const AV1_COMP *cpi) { if (is_stat_consumption_stage_twopass(cpi)) { - return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 - : MAX_LOOP_FILTER; + return cpi->ppi->twopass.section_intra_rating > 8 ? 
MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; } else { return MAX_LOOP_FILTER; } @@ -78,16 +78,16 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, 0, #endif mt_info->workers, num_workers, - &mt_info->lf_row_sync); + &mt_info->lf_row_sync, 0); else av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, #if CONFIG_LPF_MASK 0, #endif - plane, plane + 1, partial_frame); + plane, plane + 1, partial_frame, 0); filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, - cm->seq_params.use_highbitdepth); + cm->seq_params->use_highbitdepth); // Re-instate the unfiltered frame yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); @@ -153,8 +153,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; if ((is_stat_consumption_stage_twopass(cpi)) && - (cpi->twopass.section_intra_rating < 20)) - bias = (bias * cpi->twopass.section_intra_rating) / 20; + (cpi->ppi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20; // yx, bias less for large block size if (cm->features.tx_mode != ONLY_4X4) bias >>= 1; @@ -205,7 +205,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (best_cost_ret) *best_cost_ret = RDCOST_DBL_WITH_NATIVE_BD_DIST( - x->rdmult, 0, (best_err << 4), cm->seq_params.bit_depth); + x->rdmult, 0, (best_err << 4), cm->seq_params->bit_depth); return filt_best; } @@ -226,7 +226,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, const int min_filter_level = 0; const int max_filter_level = av1_get_max_filter_level(cpi); const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); // based on tests result for rtc test set // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point const int strength_boost_q_treshold = 0; @@ -244,7 +244,7 @@ void av1_pick_filter_level(const 
YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // And high bit depth separately: // filt_guess = q * 0.316206 + 3.87252 int filt_guess; - switch (cm->seq_params.bit_depth) { + switch (cm->seq_params->bit_depth) { case AOM_BITS_8: filt_guess = (cm->current_frame.frame_type == KEY_FRAME) @@ -263,7 +263,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, "or AOM_BITS_12"); return; } - if (cm->seq_params.bit_depth != AOM_BITS_8 && + if (cm->seq_params->bit_depth != AOM_BITS_8 && cm->current_frame.frame_type == KEY_FRAME) filt_guess -= 4; // TODO(chengchen): retrain the model for Y, U, V filter levels @@ -272,10 +272,20 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); } else { - const int last_frame_filter_level[4] = { lf->filter_level[0], - lf->filter_level[1], - lf->filter_level_u, - lf->filter_level_v }; + int last_frame_filter_level[4] = { 0 }; + if (!frame_is_intra_only(cm)) { +#if CONFIG_FRAME_PARALLEL_ENCODE + last_frame_filter_level[0] = cpi->ppi->filter_level[0]; + last_frame_filter_level[1] = cpi->ppi->filter_level[1]; + last_frame_filter_level[2] = cpi->ppi->filter_level_u; + last_frame_filter_level[3] = cpi->ppi->filter_level_v; +#else + last_frame_filter_level[0] = lf->filter_level[0]; + last_frame_filter_level[1] = lf->filter_level[1]; + last_frame_filter_level[2] = lf->filter_level_u; + last_frame_filter_level[3] = lf->filter_level_v; +#endif + } lf->filter_level[0] = lf->filter_level[1] = search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, @@ -297,5 +307,14 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, last_frame_filter_level, NULL, 2, 0); } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Store current frame loopfilter levels if update flag is set. 
+ if (cpi->do_frame_data_update) { + cpi->ppi->filter_level[0] = lf->filter_level[0]; + cpi->ppi->filter_level[1] = lf->filter_level[1]; + cpi->ppi->filter_level_u = lf->filter_level_u; + cpi->ppi->filter_level_v = lf->filter_level_v; + } +#endif } } diff --git a/third_party/libaom/source/libaom/av1/encoder/pickrst.c b/third_party/libaom/source/libaom/av1/encoder/pickrst.c index 21965138be..2c12cb014f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pickrst.c +++ b/third_party/libaom/source/libaom/av1/encoder/pickrst.c @@ -199,8 +199,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, const int is_uv = plane > 0; const RestorationInfo *rsi = &cm->rst_info[plane]; RestorationLineBuffers rlbs; - const int bit_depth = cm->seq_params.bit_depth; - const int highbd = cm->seq_params.use_highbitdepth; + const int bit_depth = cm->seq_params->bit_depth; + const int highbd = cm->seq_params->use_highbitdepth; const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; // TODO(yunqing): For now, only use optimized LR filter in decoder. 
Can be @@ -209,8 +209,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, av1_loop_restoration_filter_unit( limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, - is_uv && cm->seq_params.subsampling_x, - is_uv && cm->seq_params.subsampling_y, highbd, bit_depth, + is_uv && cm->seq_params->subsampling_x, + is_uv && cm->seq_params->subsampling_y, highbd, bit_depth, fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr); @@ -886,8 +886,8 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits, const MACROBLOCK *const x = rsc->x; const AV1_COMMON *const cm = rsc->cm; - const int highbd = cm->seq_params.use_highbitdepth; - const int bit_depth = cm->seq_params.bit_depth; + const int highbd = cm->seq_params->use_highbitdepth; + const int bit_depth = cm->seq_params->bit_depth; const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0]; // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set @@ -905,8 +905,8 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits, rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; const int is_uv = rsc->plane > 0; - const int ss_x = is_uv && cm->seq_params.subsampling_x; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; @@ -1474,12 +1474,12 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, const int scale[3] = { 0, 1, 2 }; // Obtain the normalized Qscale const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0, - rsc->cm->seq_params.bit_depth) >> + rsc->cm->seq_params->bit_depth) >> 3; // Derive threshold as sqr(normalized Qscale) * scale / 16, const uint64_t thresh = (qs * qs * 
scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4; - const int highbd = rsc->cm->seq_params.use_highbitdepth; + const int highbd = rsc->cm->seq_params->use_highbitdepth; const uint64_t src_var = var_restoration_unit(limits, rsc->src, rsc->plane, highbd); // Do not perform Wiener search if source variance is lower than threshold @@ -1510,11 +1510,11 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, #if CONFIG_AV1_HIGHBITDEPTH const AV1_COMMON *const cm = rsc->cm; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, - rsc->src_stride, M, H, cm->seq_params.bit_depth); + rsc->src_stride, M, H, cm->seq_params->bit_depth); } else { av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, limits->h_end, limits->v_start, @@ -1567,10 +1567,10 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE], - rsc->cm->seq_params.bit_depth); + rsc->cm->seq_params->bit_depth); double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER], - rsc->cm->seq_params.bit_depth); + rsc->cm->seq_params->bit_depth); RestorationType rtype = (cost_wiener < cost_none) ? 
RESTORE_WIENER : RESTORE_NONE; @@ -1601,7 +1601,7 @@ static AOM_INLINE void search_norestore(const RestorationTileLimits *limits, RestSearchCtxt *rsc = (RestSearchCtxt *)priv; RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; - const int highbd = rsc->cm->seq_params.use_highbitdepth; + const int highbd = rsc->cm->seq_params->use_highbitdepth; rusi->sse[RESTORE_NONE] = sse_restoration_unit( limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd); @@ -1653,8 +1653,8 @@ static AOM_INLINE void search_switchable(const RestorationTileLimits *limits, } const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits; - double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(x->rdmult, bits >> 4, sse, - rsc->cm->seq_params.bit_depth); + double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth); if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); if (r == 0 || cost < best_cost) { @@ -1694,7 +1694,7 @@ static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) { av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc, &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL); return RDCOST_DBL_WITH_NATIVE_BD_DIST( - rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params.bit_depth); + rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params->bit_depth); } static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) { @@ -1740,7 +1740,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { double best_cost = 0; RestorationType best_rtype = RESTORE_NONE; - const int highbd = rsc.cm->seq_params.use_highbitdepth; + const int highbd = rsc.cm->seq_params->use_highbitdepth; if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) { av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, rsc.dgd_stride, 
RESTORATION_BORDER, RESTORATION_BORDER, diff --git a/third_party/libaom/source/libaom/av1/encoder/ratectrl.c b/third_party/libaom/source/libaom/av1/encoder/ratectrl.c index 33befa6147..c24c822b9b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ratectrl.c +++ b/third_party/libaom/source/libaom/av1/encoder/ratectrl.c @@ -233,11 +233,12 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; + PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc; lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate) - encoded_frame_size; // Clip buffer level to maximum buffer size for the layer. lrc->bits_off_target = - AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + AOMMIN(lrc->bits_off_target, lp_rc->maximum_buffer_size); lrc->buffer_level = lrc->bits_off_target; } } @@ -245,6 +246,7 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Non-viewable frames are a special case and are treated as pure overhead. if (!cm->show_frame) @@ -253,10 +255,11 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; // Clip the buffer level to the maximum specified buffer size. 
- rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->bits_off_target = AOMMIN(rc->bits_off_target, p_rc->maximum_buffer_size); rc->buffer_level = rc->bits_off_target; - if (cpi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size); + if (cpi->ppi->use_svc) + update_layer_buffer_level(&cpi->svc, encoded_frame_size); } int av1_rc_get_default_min_gf_interval(int width, int height, @@ -285,7 +288,24 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { return AOMMAX(interval, min_gf_interval); } -void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { +void av1_primary_rc_init(const AV1EncoderConfig *oxcf, + PRIMARY_RATE_CONTROL *p_rc) { + int min_gf_interval = oxcf->gf_cfg.min_gf_interval; + int max_gf_interval = oxcf->gf_cfg.max_gf_interval; + if (min_gf_interval == 0) + min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + oxcf->input_cfg.init_framerate); + if (max_gf_interval == 0) + max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->input_cfg.init_framerate, min_gf_interval); + p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2; + p_rc->this_key_frame_forced = 0; + p_rc->next_key_frame_forced = 0; +} + +void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc, + const PRIMARY_RATE_CONTROL *const p_rc) { const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; int i; @@ -302,8 +322,8 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q; rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q; - rc->buffer_level = rc->starting_buffer_level; - rc->bits_off_target = rc->starting_buffer_level; + rc->buffer_level = p_rc->starting_buffer_level; + rc->bits_off_target = p_rc->starting_buffer_level; rc->rolling_target_bits = rc->avg_frame_bandwidth; rc->rolling_actual_bits = rc->avg_frame_bandwidth; @@ -312,8 +332,6 @@ 
void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->total_target_bits = 0; rc->frames_since_key = 8; // Sensible default for first frame. - rc->this_key_frame_forced = 0; - rc->next_key_frame_forced = 0; rc->frames_till_gf_update_due = 0; rc->ni_av_qi = rc_cfg->worst_allowed_q; @@ -337,7 +355,6 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { if (rc->max_gf_interval == 0) rc->max_gf_interval = av1_rc_get_default_max_gf_interval( oxcf->input_cfg.init_framerate, rc->min_gf_interval); - rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; rc->avg_frame_low_motion = 0; rc->resize_state = ORIG; @@ -349,6 +366,7 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { int av1_rc_drop_frame(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; if (!oxcf->rc_cfg.drop_frames_water_mark) { return 0; @@ -360,7 +378,7 @@ int av1_rc_drop_frame(AV1_COMP *cpi) { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. 
int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * - rc->optimal_buffer_level / 100); + p_rc->optimal_buffer_level / 100); if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { --rc->decimation_factor; } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { @@ -384,6 +402,7 @@ int av1_rc_drop_frame(AV1_COMP *cpi) { static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1_COMMON *const cm = &cpi->common; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; const int max_delta = 16; @@ -397,7 +416,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height || change_avg_frame_bandwidth); // Apply some control/clamp to QP under certain conditions. - if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc && + if (cm->current_frame.frame_type != KEY_FRAME && !cpi->ppi->use_svc && rc->frames_since_key > 1 && !change_target_bits_mb && (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct || !(refresh_frame_flags->alt_ref_frame || @@ -411,7 +430,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { // Adjust Q base on source content change from scene detection. if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 && rc->frames_since_key > 10) { - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; double delta = (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0; // Push Q downwards if content change is decreasing and buffer level @@ -419,14 +438,14 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { // only for high Q to avoid excess overshoot. 
// Else reduce decrease in Q from previous frame if content change is // increasing and buffer is below max (so not undershooting). - if (delta < 0.0 && rc->buffer_level > (rc->optimal_buffer_level >> 2) && + if (delta < 0.0 && rc->buffer_level > (p_rc->optimal_buffer_level >> 2) && q > (rc->worst_quality >> 1)) { double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta); double q_val = av1_convert_qindex_to_q(q, bit_depth); q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } else if (rc->q_1_frame - q > 0 && delta > 0.1 && - rc->buffer_level < AOMMIN(rc->maximum_buffer_size, - rc->optimal_buffer_level << 1)) { + rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size, + p_rc->optimal_buffer_level << 1)) { q = (3 * q + rc->q_1_frame) >> 2; } } @@ -452,8 +471,9 @@ static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = { GF_ARF_LOW, // INTNL_ARF_UPDATE }; -static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) { - const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; +static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group, + int gf_frame_index) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index]; assert(update_type < FRAME_UPDATE_TYPES); return rate_factor_levels[update_type]; } @@ -480,12 +500,13 @@ static double get_rate_correction_factor(const AV1_COMP *cpi, int width, if (cpi->common.current_frame.frame_type == KEY_FRAME) { rcf = rc->rate_correction_factors[KF_STD]; } else if (is_stat_consumption_stage(cpi)) { - const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); rcf = rc->rate_correction_factors[rf_lvl]; } else { if ((refresh_frame_flags->alt_ref_frame || refresh_frame_flags->golden_frame) && - !rc->is_src_frame_alt_ref && !cpi->use_svc && + !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && (cpi->oxcf.rc_cfg.mode != 
AOM_CBR || cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) rcf = rc->rate_correction_factors[GF_ARF_STD]; @@ -524,12 +545,13 @@ static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width, if (cpi->common.current_frame.frame_type == KEY_FRAME) { rc->rate_correction_factors[KF_STD] = factor; } else if (is_stat_consumption_stage(cpi)) { - const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); rc->rate_correction_factors[rf_lvl] = factor; } else { if ((refresh_frame_flags->alt_ref_frame || refresh_frame_flags->golden_frame) && - !rc->is_src_frame_alt_ref && !cpi->use_svc && + !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && (cpi->oxcf.rc_cfg.mode != AOM_CBR || cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) rc->rate_correction_factors[GF_ARF_STD] = factor; @@ -564,7 +586,7 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width, } else { projected_size_based_on_q = av1_estimate_bits_at_q( cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs, - rate_correction_factor, cm->seq_params.bit_depth, + rate_correction_factor, cm->seq_params->bit_depth, cpi->is_screen_content_type); } // Work out a size correction factor. @@ -620,7 +642,7 @@ static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh, return use_cyclic_refresh ? 
av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor) : av1_rc_bits_per_mb(cm->current_frame.frame_type, q, - correction_factor, cm->seq_params.bit_depth, + correction_factor, cm->seq_params->bit_depth, cpi->is_screen_content_type); } @@ -724,26 +746,31 @@ static int get_active_quality(int q, int gfu_boost, int low, int high, } } -static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, +static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, aom_bit_depth_t bit_depth) { int *kf_low_motion_minq; int *kf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); - return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high, kf_low_motion_minq, kf_high_motion_minq); } -static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, - aom_bit_depth_t bit_depth) { +static int get_gf_active_quality_no_rc(int gfu_boost, int q, + aom_bit_depth_t bit_depth) { int *arfgf_low_motion_minq; int *arfgf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); - return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + return get_active_quality(q, gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } +static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, + aom_bit_depth_t bit_depth) { + return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth); +} + static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { int *arfgf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); @@ -782,8 +809,9 @@ static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { // (at buffer = critical level). 
const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; // Buffer level below which we push active_worst to worst_quality. - int64_t critical_level = rc->optimal_buffer_level >> 3; + int64_t critical_level = p_rc->optimal_buffer_level >> 3; int64_t buff_lvl_step = 0; int adjustment = 0; int active_worst_quality; @@ -799,25 +827,26 @@ static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { rc->avg_frame_qindex[KEY_FRAME]) : rc->avg_frame_qindex[INTER_FRAME]; active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); - if (rc->buffer_level > rc->optimal_buffer_level) { + if (rc->buffer_level > p_rc->optimal_buffer_level) { // Adjust down. // Maximum limit for down adjustment, ~30%. int max_adjustment_down = active_worst_quality / 3; if (max_adjustment_down) { - buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / - max_adjustment_down); + buff_lvl_step = + ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) / + max_adjustment_down); if (buff_lvl_step) - adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / + adjustment = (int)((rc->buffer_level - p_rc->optimal_buffer_level) / buff_lvl_step); active_worst_quality -= adjustment; } } else if (rc->buffer_level > critical_level) { // Adjust up from ambient Q. 
if (critical_level) { - buff_lvl_step = (rc->optimal_buffer_level - critical_level); + buff_lvl_step = (p_rc->optimal_buffer_level - critical_level); if (buff_lvl_step) { adjustment = (int)((rc->worst_quality - ambient_qp) * - (rc->optimal_buffer_level - rc->buffer_level) / + (p_rc->optimal_buffer_level - rc->buffer_level) / buff_lvl_step); } active_worst_quality = ambient_qp + adjustment; @@ -835,10 +864,11 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, int width, int height) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; const CurrentFrame *const current_frame = &cm->current_frame; int *rtc_minq; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; int active_best_quality = rc->best_quality; ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); @@ -846,7 +876,7 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, // Handle the special case for key frames forced when we have reached // the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. - if (rc->this_key_frame_forced) { + if (p_rc->this_key_frame_forced) { int qindex = rc->last_boosted_qindex; double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, @@ -856,8 +886,8 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; - active_best_quality = - get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); + active_best_quality = get_kf_active_quality( + p_rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. 
if ((width * height) <= (352 * 288)) { q_adj_factor -= 0.25; @@ -868,7 +898,7 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, active_best_quality += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } - } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && cpi->oxcf.rc_cfg.gf_cbr_boost_pct && (refresh_frame_flags->golden_frame || refresh_frame_flags->alt_ref_frame)) { @@ -880,7 +910,7 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { q = rc->avg_frame_qindex[INTER_FRAME]; } - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. FRAME_TYPE frame_type = @@ -913,9 +943,10 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const CurrentFrame *const current_frame = &cm->current_frame; int q; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi); int active_best_quality = calc_active_best_quality_no_stats_cbr( cpi, active_worst_quality, width, height); @@ -932,7 +963,7 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, *bottom_index = active_best_quality; // Limit Q range for the adaptive loop. 
- if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { int qdelta = 0; aom_clear_system_state(); @@ -944,7 +975,7 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, } // Special case code to try and match quality with forced key frames - if (current_frame->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + if (current_frame->frame_type == KEY_FRAME && p_rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -1018,7 +1049,7 @@ static int get_active_cq_level(const RATE_CONTROL *rc, * \c oxcf->cq_level, or slightly modified for some * special cases) * \param[in] bit_depth Bit depth of the codec (same as - * \c cm->seq_params.bit_depth) + * \c cm->seq_params->bit_depth) * \return Returns selected q index to be used for encoding this frame. */ static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf, @@ -1037,13 +1068,16 @@ static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf, return cq_level; } offset_idx = 0; - } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE) { - offset_idx = 1; - } else if (update_type == INTNL_ARF_UPDATE) { - offset_idx = - AOMMIN(gf_group->layer_depth[gf_index], FIXED_QP_OFFSET_COUNT - 1); - } else { // Leaf level / overlay frame. - assert(update_type == LF_UPDATE || update_type == OVERLAY_UPDATE || + } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE || + update_type == INTNL_ARF_UPDATE || update_type == LF_UPDATE) { + if (gf_group->layer_depth[gf_index] >= + gf_group->max_layer_depth_allowed + 1) { // Leaf. + return cq_level; // Directly Return worst quality allowed. + } + offset_idx = AOMMIN(gf_group->layer_depth[gf_index], + gf_group->max_layer_depth_allowed); + } else { // Overlay frame. 
+ assert(update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE); return cq_level; // Directly Return worst quality allowed. } @@ -1081,10 +1115,11 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, int *bottom_index, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; assert(has_no_stats_stage(cpi)); @@ -1097,7 +1132,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; if (oxcf->q_cfg.use_fixed_qp_offsets) { return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_index, cq_level, @@ -1117,7 +1152,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, const int delta_qindex = av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); - } else if (rc->this_key_frame_forced) { + } else if (p_rc->this_key_frame_forced) { const int qindex = rc->last_boosted_qindex; const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = av1_compute_qdelta( @@ -1126,8 +1161,8 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, } else { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; - active_best_quality = - get_kf_active_quality(rc, 
rc->avg_frame_qindex[KEY_FRAME], bit_depth); + active_best_quality = get_kf_active_quality( + p_rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. if ((width * height) <= (352 * 288)) { @@ -1148,14 +1183,29 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. + int avg_frame_qindex_inter_frame; + int avg_frame_qindex_key_frame; +#if CONFIG_FRAME_PARALLEL_ENCODE + avg_frame_qindex_inter_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[INTER_FRAME] + : cpi->rc.avg_frame_qindex[INTER_FRAME]; + avg_frame_qindex_key_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[KEY_FRAME] + : cpi->rc.avg_frame_qindex[KEY_FRAME]; +#else + avg_frame_qindex_inter_frame = rc->avg_frame_qindex[INTER_FRAME]; + avg_frame_qindex_key_frame = rc->avg_frame_qindex[KEY_FRAME]; +#endif q = (rc->frames_since_key > 1 && - rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) - ? rc->avg_frame_qindex[INTER_FRAME] - : rc->avg_frame_qindex[KEY_FRAME]; + avg_frame_qindex_inter_frame < active_worst_quality) + ? avg_frame_qindex_inter_frame + : avg_frame_qindex_key_frame; // For constrained quality dont allow Q less than the cq level if (rc_mode == AOM_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; } else if (rc_mode == AOM_Q) { @@ -1167,7 +1217,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); } } else { if (rc_mode == AOM_Q) { @@ -1206,8 +1256,8 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, { int qdelta = 0; aom_clear_system_state(); - if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && - current_frame->frame_number != 0) { + if (current_frame->frame_type == KEY_FRAME && + !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { qdelta = av1_compute_qdelta_by_rate( &cpi->rc, current_frame->frame_type, active_worst_quality, 2.0, cpi->is_screen_content_type, bit_depth); @@ -1226,7 +1276,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, q = active_best_quality; // Special case code to try and match quality with forced key frames } else if ((current_frame->frame_type == KEY_FRAME) && - rc->this_key_frame_forced) { + p_rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -1251,16 +1301,17 @@ static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75, 1.50, 1.25, 1.15, 1.0 }; int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) { - const GF_GROUP *const gf_group = &cpi->gf_group; - const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(gf_group); - const FRAME_TYPE frame_type = gf_group->frame_type[gf_group->index]; - const int arf_layer = AOMMIN(gf_group->layer_depth[gf_group->index], 6); + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(gf_group, 
cpi->gf_frame_index); + const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const double rate_factor = (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer]; return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor, cpi->is_screen_content_type, - cpi->common.seq_params.bit_depth); + cpi->common.seq_params->bit_depth); } // This unrestricted Q selection on CQ mode is useful when testing new features, @@ -1275,7 +1326,7 @@ static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width, const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth); (void)width; (void)height; @@ -1295,10 +1346,11 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, int cq_level, int is_fwd_kf) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; int active_best_quality; int active_worst_quality = *active_worst; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) { // If the next frame is also a key frame or the current frame is the @@ -1315,7 +1367,7 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, const int delta_qindex = av1_compute_qdelta( rc, last_boosted_q, last_boosted_q * 0.25, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); - } else if (rc->this_key_frame_forced) { + } else if (p_rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached // 
the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. @@ -1324,8 +1376,8 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, int qindex; if (is_stat_consumption_stage_twopass(cpi) && - cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { - qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = AOMMIN(p_rc->last_kf_qindex, rc->last_boosted_qindex); active_best_quality = qindex; last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, @@ -1346,13 +1398,13 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, // Baseline value derived from cpi->active_worst_quality and kf boost. active_best_quality = - get_kf_active_quality(rc, active_worst_quality, bit_depth); + get_kf_active_quality(p_rc, active_worst_quality, bit_depth); if (cpi->is_screen_content_type) { active_best_quality /= 2; } if (is_stat_consumption_stage_twopass(cpi) && - cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { active_best_quality /= 3; } @@ -1363,7 +1415,8 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, // Make a further adjustment based on the kf zero motion measure. if (is_stat_consumption_stage_twopass(cpi)) - q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + q_adj_factor += + 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct); // Convert the adjustment factor to a qindex delta // on active_best_quality. 
@@ -1394,8 +1447,9 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, int *active_best) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const int bit_depth = cpi->common.seq_params.bit_depth; + const int bit_depth = cpi->common.seq_params->bit_depth; int active_best_quality = *active_best; int active_worst_quality = *active_worst; // Extension to max or min Q if undershoot or overshoot is outside @@ -1406,20 +1460,21 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, (refresh_frame_flags->golden_frame || is_intrl_arf_boost || refresh_frame_flags->alt_ref_frame))) { active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); - active_worst_quality += (cpi->twopass.extend_maxq / 2); + (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast); + active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2); } else { active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; - active_worst_quality += cpi->twopass.extend_maxq; + (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast) / + 2; + active_worst_quality += cpi->ppi->twopass.extend_maxq; } } aom_clear_system_state(); #ifndef STRICT_RC // Static forced key frames Q restrictions dealt with elsewhere. 
- if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced || - (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { + if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced || + (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality); active_worst_quality = AOMMAX(active_worst_quality + qdelta, active_best_quality); @@ -1464,18 +1519,19 @@ static int get_q(const AV1_COMP *cpi, const int width, const int height, const int active_best_quality) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int q; if (cpi->oxcf.rc_cfg.mode == AOM_Q || - (frame_is_intra_only(cm) && !rc->this_key_frame_forced && - cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && + (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced && + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && rc->frames_to_key > 1)) { q = active_best_quality; // Special case code to try and match quality with forced key frames. - } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { + } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) { // If static since last kf use better of last boosted and last kf q. 
- if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { - q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + q = AOMMIN(p_rc->last_kf_qindex, rc->last_boosted_qindex); } else { q = AOMMIN(rc->last_boosted_qindex, (active_best_quality + active_worst_quality) / 2); @@ -1504,20 +1560,29 @@ static int get_active_best_quality(const AV1_COMP *const cpi, const int active_worst_quality, const int cq_level, const int gf_index) { const AV1_COMMON *const cm = &cpi->common; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; int *inter_minq; + int avg_frame_qindex_inter_frame; ASSIGN_MINQ_TABLE(bit_depth, inter_minq); int active_best_quality = 0; const int is_intrl_arf_boost = gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; - const int is_leaf_frame = - !(refresh_frame_flags->golden_frame || - refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost); + int is_leaf_frame = + !(gf_group->update_type[gf_index] == ARF_UPDATE || + gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost); + + // TODO(jingning): Consider to rework this hack that covers issues incurred + // in lightfield setting. 
+ if (cm->tiles.large_scale) { + is_leaf_frame = !(refresh_frame_flags->golden_frame || + refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost); + } const int is_overlay_frame = rc->is_src_frame_alt_ref; if (is_leaf_frame || is_overlay_frame) { @@ -1532,31 +1597,35 @@ static int get_active_best_quality(const AV1_COMP *const cpi, return active_best_quality; } - // TODO(chengchen): can we remove this condition? - if (rc_mode == AOM_Q && !refresh_frame_flags->alt_ref_frame && - !refresh_frame_flags->golden_frame && !is_intrl_arf_boost) { - return cq_level; - } - // Determine active_best_quality for frames that are not leaf or overlay. int q = active_worst_quality; +#if CONFIG_FRAME_PARALLEL_ENCODE + // For quality simulation purpose - for parallel frames use previous + // avg_frame_qindex + avg_frame_qindex_inter_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[INTER_FRAME] + : rc->avg_frame_qindex[INTER_FRAME]; +#else + avg_frame_qindex_inter_frame = rc->avg_frame_qindex[INTER_FRAME]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. if (rc->frames_since_key > 1 && - rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { - q = rc->avg_frame_qindex[INTER_FRAME]; + avg_frame_qindex_inter_frame < active_worst_quality) { + q = avg_frame_qindex_inter_frame; } if (rc_mode == AOM_CQ && q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); // Constrained quality use slightly lower active best. 
if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16; const int min_boost = get_gf_high_motion_quality(q, bit_depth); const int boost = min_boost - active_best_quality; - active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); + active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor); if (!is_intrl_arf_boost) return active_best_quality; - if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = rc->arf_q; + if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q; int this_height = gf_group_pyramid_level(gf_group, gf_index); while (this_height > 1) { active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; @@ -1565,6 +1634,87 @@ static int get_active_best_quality(const AV1_COMP *const cpi, return active_best_quality; } +// Returns the q_index for a single frame in the GOP. +// This function assumes that rc_mode == AOM_Q mode. +int av1_q_mode_get_q_index(int base_q_index, const GF_GROUP *gf_group, + const int gf_index, int arf_q) { + const int is_intrl_arf_boost = + gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; + int is_leaf_or_overlay_frame = + gf_group->update_type[gf_index] == LF_UPDATE || + gf_group->update_type[gf_index] == OVERLAY_UPDATE || + gf_group->update_type[gf_index] == INTNL_OVERLAY_UPDATE; + + if (is_leaf_or_overlay_frame) return base_q_index; + + if (!is_intrl_arf_boost) return arf_q; + + int active_best_quality = arf_q; + int active_worst_quality = base_q_index; + int this_height = gf_group_pyramid_level(gf_group, gf_index); + while (this_height > 1) { + active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; + --this_height; + } + return active_best_quality; +} + +// Returns the q_index for the ARF in the GOP. 
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth, + int arf_boost_factor) { + int active_best_quality = + get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth); + const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth); + const int boost = min_boost - active_best_quality; + return min_boost - (int)(boost * arf_boost_factor); +} + +static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width, + int height, int gf_index, + int *bottom_index, int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = + get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, + cm->superres_scale_denominator); + int active_best_quality = 0; + int active_worst_quality = rc->active_worst_quality; + int q; + + if (frame_is_intra_only(cm)) { + const int is_fwd_kf = cm->current_frame.frame_type == KEY_FRAME && + cm->show_frame == 0 && cpi->no_show_fwd_kf; + get_intra_q_and_bounds(cpi, width, height, &active_best_quality, + &active_worst_quality, cq_level, is_fwd_kf); + } else { + // Active best quality limited by previous layer. 
+ active_best_quality = + get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); + } + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + *top_index = AOMMAX(*top_index, rc->best_quality); + *top_index = AOMMIN(*top_index, rc->worst_quality); + + *bottom_index = AOMMAX(*bottom_index, rc->best_quality); + *bottom_index = AOMMIN(*bottom_index, rc->worst_quality); + + q = active_best_quality; + + q = AOMMAX(q, rc->best_quality); + q = AOMMIN(q, rc->worst_quality); + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + + return q; +} + /*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc. * * Handles the the general cases not covered by @@ -1587,20 +1737,25 @@ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; assert(IMPLIES(has_no_stats_stage(cpi), cpi->oxcf.rc_cfg.mode == AOM_Q && gf_group->update_type[gf_index] != ARF_UPDATE)); const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; if (oxcf->q_cfg.use_fixed_qp_offsets) { - return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_group->index, + return get_q_using_fixed_offsets(oxcf, rc, gf_group, cpi->gf_frame_index, cq_level, bit_depth); } + if (oxcf->rc_cfg.mode == AOM_Q) { + return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index, + bottom_index, top_index); + } + int active_best_quality = 0; int active_worst_quality = 
rc->active_worst_quality; int q; @@ -1620,8 +1775,7 @@ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, // Active best quality limited by previous layer. const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index); - if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS) || - (oxcf->rc_cfg.mode == AOM_Q)) { + if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) { active_best_quality = get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); } else { @@ -1668,13 +1822,13 @@ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, return q; } -int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width, - int height, int gf_index, int *bottom_index, - int *top_index) { +int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, + int gf_index, int *bottom_index, int *top_index) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int q; // TODO(sarahparker) merge no-stats vbr and altref q computation // with rc_pick_q_and_bounds(). 
- const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; if ((cpi->oxcf.rc_cfg.mode != AOM_Q || gf_group->update_type[gf_index] == ARF_UPDATE) && has_no_stats_stage(cpi)) { @@ -1694,7 +1848,7 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width, q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index, top_index); } - if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q; + if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q; return q; } @@ -1756,11 +1910,12 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { const AV1_COMMON *const cm = &cpi->common; const CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; - const GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; const int is_intrnl_arf = - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; const int qindex = cm->quant_params.base_qindex; @@ -1776,7 +1931,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); } else { - if ((cpi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || + if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || (!rc->is_src_frame_alt_ref && !(refresh_frame_flags->golden_frame || is_intrnl_arf || refresh_frame_flags->alt_ref_frame))) { @@ -1784,7 +1939,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); rc->ni_frames++; - rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth); + rc->tot_q += 
av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth); rc->avg_q = rc->tot_q / rc->ni_frames; // Calculate the average Q for normal inter frames (not key or GFU // frames). @@ -1792,7 +1947,23 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; } } - +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->rc.avg_frame_qindex, + * this need to be taken care appropriately in final FPMT implementation + * to carry these values to subsequent frames. The avg_frame_qindex update + * is accumulated across frames, so the values from all individual parallel + * frames need to be taken into account after all the parallel frames are + * encoded. + * + * The variable temp_avg_frame_qindex introduced only for quality simulation + * purpose, it retains the value previous to the parallel encode frames. The + * variable is updated based on the update flag. + */ + if (cpi->do_frame_data_update && !rc->is_src_frame_alt_ref) { + for (int index = 0; index < FRAME_TYPES; index++) + cpi->ppi->temp_avg_frame_qindex[index] = rc->avg_frame_qindex[index]; + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE // Keep record of last boosted (KF/GF/ARF) Q value. // If the current frame is coded at a lower Q then we also update it. 
// If all mbs in this group are skipped only update if the Q value is @@ -1800,12 +1971,12 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // This is used to help set quality in forced key frames to reduce popping if ((qindex < rc->last_boosted_qindex) || (current_frame->frame_type == KEY_FRAME) || - (!rc->constrained_gf_group && + (!p_rc->constrained_gf_group && (refresh_frame_flags->alt_ref_frame || is_intrnl_arf || (refresh_frame_flags->golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } - if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex; + if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex; update_buffer_level(cpi, rc->projected_frame_size); rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth; @@ -1853,6 +2024,7 @@ void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { cpi->rc.frames_to_key--; cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; + cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; } int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, @@ -1954,7 +2126,7 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, * The no.of.stats available in the case of LAP is limited, * hence setting to max_gf_interval. */ - if (cpi->lap_enabled) + if (cpi->ppi->lap_enabled) rc->static_scene_max_gf_interval = rc->max_gf_interval + 1; else rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; @@ -2003,8 +2175,8 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { RATE_CONTROL *const rc = &cpi->rc; int64_t vbr_bits_off_target = rc->vbr_bits_off_target; const int stats_count = - cpi->twopass.stats_buf_ctx->total_stats != NULL - ? (int)cpi->twopass.stats_buf_ctx->total_stats->count + cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL + ? 
(int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count : 0; const int frame_window = AOMMIN( 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number)); @@ -2048,16 +2220,17 @@ int av1_calc_pframe_target_size_one_pass_vbr( const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) { static const int af_ratio = 10; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int64_t target; #if USE_ALTREF_FOR_ONE_PASS if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE || frame_update_type == ARF_UPDATE) { - target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval * + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * af_ratio) / - (rc->baseline_gf_interval + af_ratio - 1); + (p_rc->baseline_gf_interval + af_ratio - 1); } else { - target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) / - (rc->baseline_gf_interval + af_ratio - 1); + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) / + (p_rc->baseline_gf_interval + af_ratio - 1); } if (target > INT_MAX) target = INT_MAX; #else @@ -2077,9 +2250,10 @@ int av1_calc_pframe_target_size_one_pass_cbr( const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) { const AV1EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; const RateControlCfg *rc_cfg = &oxcf->rc_cfg; - const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; - const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; + const int64_t diff = p_rc->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100; int min_frame_target = AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); int target; @@ -2087,17 +2261,17 @@ int av1_calc_pframe_target_size_one_pass_cbr( if (rc_cfg->gf_cbr_boost_pct) { const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100; if (frame_update_type == 
GF_UPDATE || frame_update_type == OVERLAY_UPDATE) { - target = - (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) / - (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + af_ratio_pct) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } else { - target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / - (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } } else { target = rc->avg_frame_bandwidth; } - if (cpi->use_svc) { + if (cpi->ppi->use_svc) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). @@ -2129,11 +2303,12 @@ int av1_calc_pframe_target_size_one_pass_cbr( int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; int target; if (cpi->common.current_frame.frame_number == 0) { - target = ((rc->starting_buffer_level / 2) > INT_MAX) + target = ((p_rc->starting_buffer_level / 2) > INT_MAX) ? 
INT_MAX - : (int)(rc->starting_buffer_level / 2); + : (int)(p_rc->starting_buffer_level / 2); } else { int kf_boost = 32; double framerate = cpi->framerate; @@ -2177,7 +2352,7 @@ void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) { int gld_idx = 0; int alt_ref_idx = 0; ext_refresh_frame_flags->update_pending = 1; - svc->external_ref_frame_config = 1; + svc->set_ref_frame_config = 1; ext_flags->ref_frame_flags = 0; ext_refresh_frame_flags->last_frame = 1; ext_refresh_frame_flags->golden_frame = 0; @@ -2268,9 +2443,9 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) { int num_samples = 0; const int thresh = 6; // SAD is computed on 64x64 blocks - const int sb_size_by_mb = (cm->seq_params.sb_size == BLOCK_128X128) - ? (cm->seq_params.mib_size >> 1) - : cm->seq_params.mib_size; + const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 @@ -2286,12 +2461,12 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) { (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { - tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, - last_src_ystride); + tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); if (check_light_change) { unsigned int sse, variance; - variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, - last_src_ystride, &sse); + variance = cpi->ppi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); // Note: sse - variance = ((sum * sum) >> 12) // Detect large lighting change. 
if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) { @@ -2344,7 +2519,8 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) { static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi, FRAME_TYPE frame_type) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; ResizePendingParams *const resize_pending_params = &cpi->resize_pending_params; int gf_update = 0; @@ -2360,34 +2536,34 @@ static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi, if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_set_golden_update(cpi); else - rc->baseline_gf_interval = MAX_GF_INTERVAL; - if (rc->baseline_gf_interval > rc->frames_to_key) - rc->baseline_gf_interval = rc->frames_to_key; - rc->gfu_boost = DEFAULT_GF_BOOST_RT; - rc->constrained_gf_group = - (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - gf_group->index = 0; + p_rc->baseline_gf_interval = MAX_GF_INTERVAL; + if (p_rc->baseline_gf_interval > rc->frames_to_key) + p_rc->baseline_gf_interval = rc->frames_to_key; + p_rc->gfu_boost = DEFAULT_GF_BOOST_RT; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + cpi->gf_frame_index = 0; // SVC does not use GF as periodic boost. // TODO(marpan): Find better way to disable this for SVC. 
- if (cpi->use_svc) { + if (cpi->ppi->use_svc) { SVC *const svc = &cpi->svc; - rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; - rc->gfu_boost = 1; - rc->constrained_gf_group = 0; - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; + p_rc->gfu_boost = 1; + p_rc->constrained_gf_group = 0; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; for (int layer = 0; layer < svc->number_spatial_layers * svc->number_temporal_layers; ++layer) { LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - lc->rc.baseline_gf_interval = rc->baseline_gf_interval; - lc->rc.gfu_boost = rc->gfu_boost; - lc->rc.constrained_gf_group = rc->constrained_gf_group; + lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval; + lc->p_rc.gfu_boost = p_rc->gfu_boost; + lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group; lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due; lc->group_index = 0; } } - gf_group->size = rc->baseline_gf_interval; + gf_group->size = p_rc->baseline_gf_interval; gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE; gf_update = 1; @@ -2398,6 +2574,7 @@ static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi, static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, int prev_width, int prev_height) { RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; SVC *const svc = &cpi->svc; double tot_scale_change = 1.0; int target_bits_per_frame; @@ -2406,8 +2583,8 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, tot_scale_change = (double)(resize_width * resize_height) / (double)(prev_width * prev_height); // Reset buffer level to optimal, update target size. 
- rc->buffer_level = rc->optimal_buffer_level; - rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = p_rc->optimal_buffer_level; + rc->bits_off_target = p_rc->optimal_buffer_level; rc->this_frame_target = av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME); target_bits_per_frame = rc->this_frame_target; @@ -2431,8 +2608,8 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, svc->number_temporal_layers + tl]; lc->rc.resize_state = rc->resize_state; - lc->rc.buffer_level = lc->rc.optimal_buffer_level; - lc->rc.bits_off_target = lc->rc.optimal_buffer_level; + lc->rc.buffer_level = lc->p_rc.optimal_buffer_level; + lc->rc.bits_off_target = lc->p_rc.optimal_buffer_level; lc->rc.rate_correction_factors[INTER_FRAME] = rc->rate_correction_factors[INTER_FRAME]; } @@ -2464,6 +2641,7 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; RESIZE_ACTION resize_action = NO_RESIZE; const int avg_qp_thr1 = 70; const int avg_qp_thr2 = 50; @@ -2486,7 +2664,7 @@ static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) { if (cpi->rc.frames_since_key > cpi->framerate) { const int window = AOMMIN(30, (int)(2 * cpi->framerate)); rc->resize_avg_qp += rc->last_q[INTER_FRAME]; - if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) + if (cpi->rc.buffer_level < (int)(30 * p_rc->optimal_buffer_level / 100)) ++rc->resize_buffer_underflow; ++rc->resize_count; // Check for resize action every "window" frames. 
@@ -2548,8 +2726,9 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, EncodeFrameParams *const frame_params, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; AV1_COMMON *const cm = &cpi->common; - GF_GROUP *const gf_group = &cpi->gf_group; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; SVC *const svc = &cpi->svc; ResizePendingParams *const resize_pending_params = &cpi->resize_pending_params; @@ -2559,35 +2738,35 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, svc->number_temporal_layers); // Turn this on to explicitly set the reference structure rather than // relying on internal/default structure. - if (cpi->use_svc) { + if (cpi->ppi->use_svc) { av1_update_temporal_layer_framerate(cpi); av1_restore_layer_context(cpi); } // Set frame type. - if ((!cpi->use_svc && rc->frames_to_key == 0) || - (cpi->use_svc && svc->spatial_layer_id == 0 && + if ((!cpi->ppi->use_svc && rc->frames_to_key == 0) || + (cpi->ppi->use_svc && svc->spatial_layer_id == 0 && (cpi->oxcf.kf_cfg.key_freq_max == 0 || svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)) || (frame_flags & FRAMEFLAGS_KEY)) { frame_params->frame_type = KEY_FRAME; - rc->this_key_frame_forced = + p_rc->this_key_frame_forced = cm->current_frame.frame_number != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max; - rc->kf_boost = DEFAULT_KF_BOOST_RT; - gf_group->update_type[gf_group->index] = KF_UPDATE; - gf_group->frame_type[gf_group->index] = KEY_FRAME; - gf_group->refbuf_state[gf_group->index] = REFBUF_RESET; - if (cpi->use_svc) { + p_rc->kf_boost = DEFAULT_KF_BOOST_RT; + gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET; + if (cpi->ppi->use_svc) { if (cm->current_frame.frame_number > 0) av1_svc_reset_temporal_layers(cpi, 1); svc->layer_context[layer].is_key_frame = 1; } } else { 
frame_params->frame_type = INTER_FRAME; - gf_group->update_type[gf_group->index] = LF_UPDATE; - gf_group->frame_type[gf_group->index] = INTER_FRAME; - gf_group->refbuf_state[gf_group->index] = REFBUF_UPDATE; - if (cpi->use_svc) { + gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE; + if (cpi->ppi->use_svc) { LAYER_CONTEXT *lc = &svc->layer_context[layer]; lc->is_key_frame = svc->spatial_layer_id == 0 @@ -2596,7 +2775,7 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, } } // Check for scene change, for non-SVC for now. - if (!cpi->use_svc && cpi->sf.rt_sf.check_scene_detection) + if (!cpi->ppi->use_svc && cpi->sf.rt_sf.check_scene_detection) rc_scene_detection_onepass_rt(cpi); // Check for dynamic resize, for single spatial layer for now. // For temporal layers only check on base temporal layer. @@ -2628,14 +2807,14 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, target = av1_calc_iframe_target_size_one_pass_cbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_cbr( - cpi, gf_group->update_type[gf_group->index]); + cpi, gf_group->update_type[cpi->gf_frame_index]); } } else { if (frame_params->frame_type == KEY_FRAME) { target = av1_calc_iframe_target_size_one_pass_vbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_vbr( - cpi, gf_group->update_type[gf_group->index]); + cpi, gf_group->update_type[cpi->gf_frame_index]); } } if (cpi->oxcf.rc_cfg.mode == AOM_Q) @@ -2644,11 +2823,21 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, av1_rc_set_frame_target(cpi, target, cm->width, cm->height); rc->base_frame_target = target; cm->current_frame.frame_type = frame_params->frame_type; + // For fixed mode SVC: if KSVC is enabled remove inter layer + // prediction on spatial enhancement layer frames for frames + // whose base is not KEY frame. 
+ if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode && + svc->number_spatial_layers > 1 && + !svc->layer_context[layer].is_key_frame) { + ExternalFlags *const ext_flags = &cpi->ext_flags; + ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; + } } int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; SPEED_FEATURES *const sf = &cpi->sf; int thresh_qp = 7 * (rc->worst_quality >> 3); // Lower thresh_qp for video (more overshoot at lower Q) to be @@ -2670,8 +2859,8 @@ int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { // have settled down to a very different (low QP) state, then not adjusting // them may cause next frame to select low QP and overshoot again. cpi->rc.avg_frame_qindex[INTER_FRAME] = *q; - rc->buffer_level = rc->optimal_buffer_level; - rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = p_rc->optimal_buffer_level; + rc->bits_off_target = p_rc->optimal_buffer_level; // Reset rate under/over-shoot flags. cpi->rc.rc_1_frame = 0; cpi->rc.rc_2_frame = 0; @@ -2680,7 +2869,7 @@ int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs); // Rate correction factor based on target_bits_per_mb and qp (==max_QP). // This comes from the inverse computation of vp9_rc_bits_per_mb(). - q2 = av1_convert_qindex_to_q(*q, cm->seq_params.bit_depth); + q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth); enumerator = 1800000; // Factor for inter frame. 
enumerator += (int)(enumerator * q2) >> 12; new_correction_factor = (double)target_bits_per_mb * q2 / enumerator; diff --git a/third_party/libaom/source/libaom/av1/encoder/ratectrl.h b/third_party/libaom/source/libaom/av1/encoder/ratectrl.h index 3f1756f5ca..a1567f038c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ratectrl.h +++ b/third_party/libaom/source/libaom/av1/encoder/ratectrl.h @@ -129,11 +129,6 @@ typedef struct { int this_frame_target; // Actual frame target after rc adjustment. /*! - * Target bit budget for the current GF / ARF group of frame. - */ - int64_t gf_group_bits; - - /*! * Projected size for current frame */ int projected_frame_size; @@ -159,20 +154,6 @@ typedef struct { int last_boosted_qindex; /*! - * Q used for last boosted (non leaf) frame - */ - int last_kf_qindex; - - /*! - * Boost factor used to calculate the extra bits allocated to ARFs and GFs - */ - int gfu_boost; - /*! - * Boost factor used to calculate the extra bits allocated to the key frame - */ - int kf_boost; - - /*! * Correction factors used to adjust the q estimate for a given target rate * in the encode loop. */ @@ -193,28 +174,10 @@ typedef struct { */ int intervals_till_gf_calculate_due; - /*! - * Stores the determined gf group lengths for a set of gf groups - */ - int gf_intervals[MAX_NUM_GF_INTERVALS]; - - /*! - * The current group's index into gf_intervals[] - */ - int cur_gf_index; - /*!\cond */ - int num_regions; - REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; - double cor_coeff[MAX_FIRSTPASS_ANALYSIS_FRAMES]; - int regions_offset; // offset of regions from the last keyframe - int frames_till_regions_update; - int min_gf_interval; int max_gf_interval; int static_scene_max_gf_interval; - int baseline_gf_interval; - int constrained_gf_group; /*!\endcond */ /*! 
* Frames before the next key frame @@ -222,8 +185,6 @@ typedef struct { int frames_to_key; /*!\cond */ int frames_since_key; - int this_key_frame_forced; - int next_key_frame_forced; int is_src_frame_alt_ref; int sframe_due; @@ -269,18 +230,6 @@ typedef struct { */ int best_quality; - /*! - * Initial buffuer level in ms for CBR / low delay encoding - */ - int64_t starting_buffer_level; - /*! - * Optimum / target buffuer level in ms for CBR / low delay encoding - */ - int64_t optimal_buffer_level; - /*! - * Maximum target buffuer level in ms for CBR / low delay encoding - */ - int64_t maximum_buffer_size; /*!\cond */ // rate control history for last frame(1) and the frame before(2). @@ -292,14 +241,8 @@ typedef struct { int q_1_frame; int q_2_frame; - float_t arf_boost_factor; - /*!\endcond */ /*! - * Q index used for ALT frame - */ - int arf_q; - /*! * Proposed maximum alloed Q for current frame */ int active_worst_quality; @@ -309,35 +252,119 @@ typedef struct { int active_best_quality[MAX_ARF_LAYERS + 1]; /*!\cond */ + // Track amount of low motion in scene + int avg_frame_low_motion; + + // For dynamic resize, 1 pass cbr. + RESIZE_STATE resize_state; + int resize_avg_qp; + int resize_buffer_underflow; + int resize_count; + /*!\endcond */ +} RATE_CONTROL; + +/*! + * \brief Primary Rate Control parameters and status + */ +typedef struct { + // Sub-gop level Rate targetting variables + + /*! + * Target bit budget for the current GF / ARF group of frame. + */ + int64_t gf_group_bits; + + /*! + * Boost factor used to calculate the extra bits allocated to the key frame + */ + int kf_boost; + + /*! + * Boost factor used to calculate the extra bits allocated to ARFs and GFs + */ + int gfu_boost; + + /*! + * Stores the determined gf group lengths for a set of gf groups + */ + int gf_intervals[MAX_NUM_GF_INTERVALS]; + + /*! 
+ * The current group's index into gf_intervals[] + */ + int cur_gf_index; + + /*!\cond */ + int num_regions; + + REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; + int regions_offset; // offset of regions from the last keyframe + int frames_till_regions_update; + + int baseline_gf_interval; + + int constrained_gf_group; + + int this_key_frame_forced; + + int next_key_frame_forced; + /*!\endcond */ + + /*! + * Initial buffuer level in ms for CBR / low delay encoding + */ + int64_t starting_buffer_level; + + /*! + * Optimum / target buffuer level in ms for CBR / low delay encoding + */ + int64_t optimal_buffer_level; + + /*! + * Maximum target buffuer level in ms for CBR / low delay encoding + */ + int64_t maximum_buffer_size; + + /*! + * Q index used for ALT frame + */ + int arf_q; + + /*!\cond */ + float_t arf_boost_factor; + int base_layer_qp; // Total number of stats used only for kf_boost calculation. int num_stats_used_for_kf_boost; + // Total number of stats used only for gfu_boost calculation. int num_stats_used_for_gfu_boost; + // Total number of stats required by gfu_boost calculation. int num_stats_required_for_gfu_boost; + int next_is_fwd_key; + int enable_scenecut_detection; - int use_arf_in_this_kf_group; - // Track amount of low motion in scene - int avg_frame_low_motion; - // For dynamic resize, 1 pass cbr. - RESIZE_STATE resize_state; - int resize_avg_qp; - int resize_buffer_underflow; - int resize_count; + int use_arf_in_this_kf_group; /*!\endcond */ -} RATE_CONTROL; -/*!\cond */ + /*! 
+ * Q used for last boosted (non leaf) frame + */ + int last_kf_qindex; +} PRIMARY_RATE_CONTROL; struct AV1_COMP; struct AV1EncoderConfig; +void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf, + PRIMARY_RATE_CONTROL *p_rc); + void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass, - RATE_CONTROL *rc); + RATE_CONTROL *rc, const PRIMARY_RATE_CONTROL *const p_rc); int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, double correction_factor, aom_bit_depth_t bit_depth, @@ -415,7 +442,6 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, * * \ingroup rate_control * \param[in] cpi Top level encoder structure - * \param[in,out] rc Top level rate control structure * \param[in] width Coded frame width * \param[in] height Coded frame height * \param[in] gf_index Index of this frame in the golden frame group @@ -424,9 +450,8 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, * \return Returns selected q index to be used for encoding this frame. * Also, updates \c rc->arf_q. 
*/ -int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc, - int width, int height, int gf_index, - int *bottom_index, int *top_index); +int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height, + int gf_index, int *bottom_index, int *top_index); /*!\brief Estimates q to achieve a target bits per frame * diff --git a/third_party/libaom/source/libaom/av1/encoder/rc_utils.h b/third_party/libaom/source/libaom/av1/encoder/rc_utils.h index 98cec2e003..0a9d02d17b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rc_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/rc_utils.h @@ -19,18 +19,45 @@ extern "C" { #endif -static AOM_INLINE void set_rc_buffer_sizes(RATE_CONTROL *rc, - const RateControlCfg *rc_cfg) { +static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + if (cpi->common.current_frame.frame_number > + (unsigned int)cpi->svc.number_spatial_layers) { + if (cpi->ppi->use_svc) { + av1_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = p_rc->optimal_buffer_level; + rc->buffer_level = p_rc->optimal_buffer_level; + } + } + } +} + +static AOM_INLINE void set_rc_buffer_sizes(AV1_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + const int64_t bandwidth = rc_cfg->target_bandwidth; const int64_t starting = rc_cfg->starting_buffer_level_ms; const int64_t optimal = rc_cfg->optimal_buffer_level_ms; const int64_t maximum = rc_cfg->maximum_buffer_size_ms; - rc->starting_buffer_level = starting * bandwidth / 1000; - rc->optimal_buffer_level = + p_rc->starting_buffer_level = starting * bandwidth / 1000; + 
p_rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; - rc->maximum_buffer_size = + p_rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + rc->bits_off_target = AOMMIN(rc->bits_off_target, p_rc->maximum_buffer_size); + rc->buffer_level = AOMMIN(rc->buffer_level, p_rc->maximum_buffer_size); } static AOM_INLINE void config_target_level(AV1_COMP *const cpi, @@ -38,7 +65,7 @@ static AOM_INLINE void config_target_level(AV1_COMP *const cpi, aom_clear_system_state(); AV1EncoderConfig *const oxcf = &cpi->oxcf; - SequenceHeader *const seq_params = &cpi->common.seq_params; + SequenceHeader *const seq_params = cpi->common.seq_params; TileConfig *const tile_cfg = &oxcf->tile_cfg; RateControlCfg *const rc_cfg = &oxcf->rc_cfg; @@ -48,11 +75,11 @@ static AOM_INLINE void config_target_level(AV1_COMP *const cpi, av1_get_max_bitrate_for_level(target_level, tier, profile); const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70); rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate); - // Also need to update cpi->twopass.bits_left. - TWO_PASS *const twopass = &cpi->twopass; + // Also need to update cpi->ppi->twopass.bits_left. + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats; if (stats != NULL) - cpi->twopass.bits_left = + cpi->ppi->twopass.bits_left = (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0); // Adjust max over-shoot percentage. 
@@ -226,6 +253,7 @@ static AOM_INLINE void recode_loop_update_q( int *const low_cr_seen, const int loop_count) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; *loop = 0; @@ -263,14 +291,15 @@ static AOM_INLINE void recode_loop_update_q( &frame_over_shoot_limit); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; - if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced && + if (cm->current_frame.frame_type == KEY_FRAME && + p_rc->this_key_frame_forced && rc->projected_frame_size < rc->max_frame_bandwidth) { int64_t kf_err; const int64_t high_err_target = cpi->ambient_err; const int64_t low_err_target = cpi->ambient_err >> 1; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); } else { kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); @@ -323,11 +352,11 @@ static AOM_INLINE void recode_loop_update_q( if (*q == *q_high && rc->projected_frame_size >= rc->max_frame_bandwidth) { const double q_val_high_current = - av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth); + av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth); const double q_val_high_new = q_val_high_current * ((double)rc->projected_frame_size / rc->max_frame_bandwidth); - *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth, + *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth, rc->best_quality, rc->worst_quality); } diff --git a/third_party/libaom/source/libaom/av1/encoder/rd.c b/third_party/libaom/source/libaom/av1/encoder/rd.c index 389b4bfe3b..e361264f16 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rd.c +++ b/third_party/libaom/source/libaom/av1/encoder/rd.c @@ -354,11 +354,45 @@ static const int rd_layer_depth_factor[7] = { 160, 160, 160, 160, 192, 208, 
224 }; +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 3.2 + (0.0035 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 3.25 + (0.0035 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 3.3 + (0.0035 * (double)qindex); +} + int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) { - const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); - int rdmult = (int)(((int64_t)88 * q * q) / 24); + const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params->bit_depth); + const FRAME_UPDATE_TYPE update_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + int rdmult = q * q; + + if (update_type == KF_UPDATE) { + double def_rd_q_mult = def_kf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult); + } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) { + double def_rd_q_mult = def_arf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult); + } else { + double def_rd_q_mult = def_inter_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult); + } - switch (cpi->common.seq_params.bit_depth) { + switch (cpi->common.seq_params->bit_depth) { case AOM_BITS_8: break; case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; @@ -373,9 +407,10 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { int64_t rdmult = 
av1_compute_rd_mult_based_on_qindex(cpi, qindex); if (is_stat_consumption_stage(cpi) && (cpi->common.current_frame.frame_type != KEY_FRAME)) { - const GF_GROUP *const gf_group = &cpi->gf_group; - const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); - const int layer_depth = AOMMIN(gf_group->layer_depth[gf_group->index], 6); + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); // Layer depth adjustment rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7; @@ -386,21 +421,30 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { return (int)rdmult; } -int av1_get_deltaq_offset(const AV1_COMP *cpi, int qindex, double beta) { +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) { assert(beta > 0.0); - int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + int q = av1_dc_quant_QTX(qindex, 0, bit_depth); int newq = (int)rint(q / sqrt(beta)); int orig_qindex = qindex; + if (newq == q) { + return 0; + } if (newq < q) { - do { + while (qindex > 0) { qindex--; - q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); - } while (newq < q && qindex > 0); + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq >= q) { + break; + } + } } else { - do { + while (qindex < MAXQ) { qindex++; - q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); - } while (newq > q && qindex < MAXQ); + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq <= q) { + break; + } + } } return qindex - orig_qindex; } @@ -409,7 +453,7 @@ int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) { assert(beta > 0.0); const AV1_COMMON *cm = &cpi->common; int q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); return (int)(av1_compute_rd_mult(cpi, q) / beta); } @@ -433,7 +477,7 @@ static int 
compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { } void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) { - switch (cpi->common.seq_params.bit_depth) { + switch (cpi->common.seq_params->bit_depth) { case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break; case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break; case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break; @@ -450,7 +494,7 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + cm->quant_params.y_dc_delta_q, 0, MAXQ); - const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth); + const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth); for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { // Threshold here seems unnecessarily harsh but fine given actual @@ -577,6 +621,13 @@ void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, } } +void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) { + dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX]; + dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX]; + av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc, + MV_SUBPEL_NONE); +} + void av1_initialize_rd_consts(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; @@ -610,14 +661,9 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { cost_upd_freq.mode == COST_UPD_TILE || fill_costs) av1_fill_mode_rates(cm, &x->mode_costs, cm->fc); - if (!use_nonrd_pick_mode && frame_is_intra_only(cm) && - cm->features.allow_screen_content_tools && + if (!use_nonrd_pick_mode && av1_allow_intrabc(cm) && !is_stat_generation_stage(cpi)) { - IntraBCMVCosts *const dv_costs = &cpi->dv_costs; - int *dvcost[2] = { &dv_costs->mv_component[0][MV_MAX], - &dv_costs->mv_component[1][MV_MAX] }; - av1_build_nmv_cost_table(dv_costs->joint_mv, dvcost, 
&cm->fc->ndvc, - MV_SUBPEL_NONE); + av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs); } } @@ -1016,12 +1062,16 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, const uint8_t *const ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col]; // Find sad for current vector. - const int this_sad = cpi->fn_ptr[block_size].sdf( + const int this_sad = cpi->ppi->fn_ptr[block_size].sdf( src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride); // Note if it is the best so far. if (this_sad < best_sad) { best_sad = this_sad; } + if (i == 0) + x->pred_mv0_sad[ref_frame] = this_sad; + else if (i == 1) + x->pred_mv1_sad[ref_frame] = this_sad; } // Note the index of the mv that worked best in the reference list. @@ -1287,7 +1337,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, const THR_MODES top_mode = MAX_MODES; const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT; - const int bsize_is_1_to_4 = bsize > cm->seq_params.sb_size; + const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size; BLOCK_SIZE min_size, max_size; if (bsize_is_1_to_4) { // This part handles block sizes with 1:4 and 4:1 aspect ratios @@ -1296,7 +1346,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, max_size = bsize; } else { min_size = AOMMAX(bsize - 2, BLOCK_4X4); - max_size = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size); + max_size = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size); } for (THR_MODES mode = 0; mode < top_mode; ++mode) { diff --git a/third_party/libaom/source/libaom/av1/encoder/rd.h b/third_party/libaom/source/libaom/av1/encoder/rd.h index e37c86b9d5..c1ba819ae2 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rd.h +++ b/third_party/libaom/source/libaom/av1/encoder/rd.h @@ -81,20 +81,6 @@ typedef struct RD_OPT { double r0; } RD_OPT; -typedef struct { - // Cost of transmitting the actual motion vector. 
- // mv_component[0][i] is the cost of motion vector with horizontal component - // (mv_row) equal to i - MV_MAX. - // mv_component[1][i] is the cost of motion vector with vertical component - // (mv_col) equal to i - MV_MAX. - int mv_component[2][MV_VALS]; - - // joint_mv[i] is the cost of transmitting joint mv(MV_JOINT_TYPE) of - // type i. - // TODO(huisu@google.com): we can update dv_joint_cost per SB. - int joint_mv[MV_JOINTS]; -} IntraBCMVCosts; - static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { #if CONFIG_RD_DEBUG int plane; @@ -110,12 +96,6 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = 0; - { - int r, c; - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) - rd_stats->txb_coeff_cost_map[plane][r][c] = 0; - } } #endif } @@ -135,19 +115,18 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = INT_MAX; - { - int r, c; - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) - rd_stats->txb_coeff_cost_map[plane][r][c] = INT16_MAX; - } } #endif } static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, const RD_STATS *rd_stats_src) { - assert(rd_stats_dst->rate != INT_MAX && rd_stats_src->rate != INT_MAX); + if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) { + // If rd_stats_dst or rd_stats_src has invalid rate, we will make + // rd_stats_dst invalid. 
+ av1_invalid_rd_stats(rd_stats_dst); + return; + } rd_stats_dst->rate = (int)AOMMIN( ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX); if (!rd_stats_dst->zero_rate) @@ -160,18 +139,6 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, // encoded, as there will only be 1 plane for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; - { - // TODO(angiebird): optimize this part - int r, c; - int ref_txb_coeff_cost = 0; - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { - rd_stats_dst->txb_coeff_cost_map[plane][r][c] += - rd_stats_src->txb_coeff_cost_map[plane][r][c]; - ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c]; - } - assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]); - } } #endif } @@ -375,9 +342,11 @@ void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, MvCosts *mv_costs); +void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs); + int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta); -int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta); +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libaom/source/libaom/av1/encoder/rdopt.c b/third_party/libaom/source/libaom/av1/encoder/rdopt.c index 6200ac11dd..3ca0cb4143 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rdopt.c +++ b/third_party/libaom/source/libaom/av1/encoder/rdopt.c @@ -627,8 +627,8 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x, get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); unsigned int sse; - cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, - &sse); + cpi->ppi->fn_ptr[bs].vf(p->src.buf, 
p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); total_sse += sse; if (!plane && sse_y) *sse_y = sse; } @@ -1156,13 +1156,16 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, int_mv best_mv; av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range, - mode_info, &best_mv); + mode_info, &best_mv, args); if (best_mv.as_int == INVALID_MV) return INT64_MAX; args->single_newmv[ref_mv_idx][refs[0]] = best_mv; args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv; args->single_newmv_valid[ref_mv_idx][refs[0]] = 1; cur_mv[0].as_int = best_mv.as_int; + + // Return after single_newmv is set. + if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX; } return 0; @@ -1276,7 +1279,7 @@ static int64_t motion_mode_rd( uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; const int rate_mv0 = *rate_mv; - const int interintra_allowed = cm->seq_params.enable_interintra_compound && + const int interintra_allowed = cm->seq_params->enable_interintra_compound && is_interintra_allowed(mbmi) && mbmi->compound_idx; WARP_SAMPLE_INFO *const warp_sample_info = @@ -1319,7 +1322,7 @@ static int64_t motion_mode_rd( const int switchable_rate = av1_is_interp_needed(xd) ? av1_get_switchable_rate(x, xd, interp_filter, - cm->seq_params.enable_dual_filter) + cm->seq_params->enable_dual_filter) : 0; int64_t best_rd = INT64_MAX; int best_rate_mv = rate_mv0; @@ -1355,11 +1358,18 @@ static int64_t motion_mode_rd( // Do not search OBMC if the probability of selecting it is below a // predetermined threshold for this update_type and block size. 
- const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < - cpi->sf.inter_sf.prune_obmc_prob_thresh; - if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || - cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) && + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int obmc_probability; +#if CONFIG_FRAME_PARALLEL_ENCODE + obmc_probability = + cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize]; +#else + obmc_probability = cpi->frame_probs.obmc_probs[update_type][bsize]; +#endif + const int prune_obmc = + obmc_probability < cpi->sf.inter_sf.prune_obmc_prob_thresh; + if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) && mbmi->motion_mode == OBMC_CAUSAL) continue; @@ -1373,7 +1383,7 @@ static int64_t motion_mode_rd( assert(!is_comp_pred); if (have_newmv_in_inter_mode(this_mode)) { av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL, - &mbmi->mv[0]); + &mbmi->mv[0], NULL); tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) { @@ -1897,10 +1907,11 @@ static bool ref_mv_idx_early_breakout( } // Compute the estimated RD cost for the motion vector with simple translation. 
-static int64_t simple_translation_pred_rd( - AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, - HandleInterModeArgs *args, int ref_mv_idx, inter_mode_info *mode_info, - int64_t ref_best_rd, BLOCK_SIZE bsize) { +static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, + HandleInterModeArgs *args, + int ref_mv_idx, int64_t ref_best_rd, + BLOCK_SIZE bsize) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; @@ -1933,7 +1944,6 @@ static int64_t simple_translation_pred_rd( const int drl_cost = get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); rd_stats->rate += drl_cost; - mode_info[ref_mv_idx].drl_cost = drl_cost; int_mv cur_mv[2]; if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) { @@ -1987,8 +1997,8 @@ static INLINE bool mask_check_bit(int mask, int index) { static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, HandleInterModeArgs *const args, - int64_t ref_best_rd, inter_mode_info *mode_info, - BLOCK_SIZE bsize, const int ref_set) { + int64_t ref_best_rd, BLOCK_SIZE bsize, + const int ref_set) { AV1_COMMON *const cm = &cpi->common; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -2027,7 +2037,7 @@ static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, continue; } idx_rdcost[ref_mv_idx] = simple_translation_pred_rd( - cpi, x, rd_stats, args, ref_mv_idx, mode_info, ref_best_rd, bsize); + cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize); } // Find the index with the best RD cost. 
int best_idx = 0; @@ -2171,14 +2181,17 @@ typedef struct { static AOM_INLINE void get_block_level_tpl_stats( AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, PruneInfoFromTpl *inter_cost_info_from_tpl) { - const GF_GROUP *const gf_group = &cpi->gf_group; AV1_COMMON *const cm = &cpi->common; - assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size)); - const int tpl_idx = gf_group->index; - TplParams *const tpl_data = &cpi->tpl_data; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (tpl_idx >= MAX_TPL_FRAME_IDX) { + return; + } const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; - if (tpl_idx >= MAX_TPL_FRAME_IDX || !tpl_frame->is_valid) { + if (!tpl_frame->is_valid) { return; } @@ -2274,101 +2287,6 @@ static AOM_INLINE int prune_modes_based_on_tpl_stats( return 0; } -// If the current mode being searched is NEWMV, this function will look -// at previously searched MVs and check if they are the same -// as the current MV. If it finds that this MV is repeated, it compares -// the cost to the previous MV and skips the rest of the search if it is -// more expensive. 
-static int skip_repeated_newmv( - AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - const int do_tx_search, const PREDICTION_MODE this_mode, - MB_MODE_INFO *best_mbmi, motion_mode_candidate *motion_mode_cand, - int64_t *ref_best_rd, RD_STATS *best_rd_stats, RD_STATS *best_rd_stats_y, - RD_STATS *best_rd_stats_uv, inter_mode_info *mode_info, - HandleInterModeArgs *args, int drl_cost, const int *refs, int_mv *cur_mv, - int64_t *best_rd, const BUFFER_SET orig_dst, int ref_mv_idx) { - // This feature only works for NEWMV when a previous mv has been searched - if (this_mode != NEWMV || ref_mv_idx == 0) return 0; - MACROBLOCKD *xd = &x->e_mbd; - const AV1_COMMON *cm = &cpi->common; - const int num_planes = av1_num_planes(cm); - - int skip = 0; - int this_rate_mv = 0; - int i; - for (i = 0; i < ref_mv_idx; ++i) { - // Check if the motion search result same as previous results - if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int && - args->single_newmv_valid[i][refs[0]]) { - // If the compared mode has no valid rd, it is unlikely this - // mode will be the best mode - if (mode_info[i].rd == INT64_MAX) { - skip = 1; - break; - } - // Compare the cost difference including drl cost and mv cost - if (mode_info[i].mv.as_int != INVALID_MV) { - const int compare_cost = mode_info[i].rate_mv + mode_info[i].drl_cost; - const int_mv ref_mv = av1_get_ref_mv(x, 0); - this_rate_mv = av1_mv_bit_cost( - &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, - x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); - const int this_cost = this_rate_mv + drl_cost; - - if (compare_cost <= this_cost) { - // Skip this mode if it is more expensive as the previous result - // for this MV - skip = 1; - break; - } else { - // If the cost is less than current best result, make this - // the best and update corresponding variables unless the - // best_mv is the same as ref_mv. 
In this case we skip and - // rely on NEAR(EST)MV instead - if (best_mbmi->ref_mv_idx == i && - best_mbmi->mv[0].as_int != ref_mv.as_int) { - assert(*best_rd != INT64_MAX); - assert(best_mbmi->mv[0].as_int == mode_info[i].mv.as_int); - best_mbmi->ref_mv_idx = ref_mv_idx; - motion_mode_cand->rate_mv = this_rate_mv; - best_rd_stats->rate += this_cost - compare_cost; - *best_rd = - RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist); - // We also need to update mode_info here because we are setting - // (ref_)best_rd here. So we will not be able to search the same - // mode again with the current configuration. - mode_info[ref_mv_idx].mv.as_int = best_mbmi->mv[0].as_int; - mode_info[ref_mv_idx].rate_mv = this_rate_mv; - mode_info[ref_mv_idx].rd = *best_rd; - if (*best_rd < *ref_best_rd) *ref_best_rd = *best_rd; - break; - } - } - } - } - } - if (skip) { - const THR_MODES mode_enum = get_prediction_mode_idx( - best_mbmi->mode, best_mbmi->ref_frame[0], best_mbmi->ref_frame[1]); - // Collect mode stats for multiwinner mode processing - store_winner_mode_stats( - &cpi->common, x, best_mbmi, best_rd_stats, best_rd_stats_y, - best_rd_stats_uv, mode_enum, NULL, bsize, *best_rd, - cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search); - args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = - args->modelled_rd[this_mode][i][refs[0]]; - args->simple_rd[this_mode][ref_mv_idx][refs[0]] = - args->simple_rd[this_mode][i][refs[0]]; - mode_info[ref_mv_idx].rd = mode_info[i].rd; - mode_info[ref_mv_idx].rate_mv = this_rate_mv; - mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int; - - restore_dst_buf(xd, orig_dst, num_planes); - return 1; - } - return 0; -} - /*!\brief High level function to select parameters for compound mode. 
* * \ingroup inter_mode_search @@ -2427,7 +2345,7 @@ static int process_compound_inter_mode( MB_MODE_INFO *mbmi = xd->mi[0]; const AV1_COMMON *cm = &cpi->common; const int masked_compound_used = is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); @@ -2506,6 +2424,76 @@ static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx, return 0; } +/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE + * + * \ingroup inter_mode_search + * + * Compares the sse of zero mv and the best sse found in single new_mv. If the + * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped. + * Else returns 0. + * + * Note that the sse of here comes from single_motion_search. So it is + * interpolated with the filter in motion search, not the actual interpolation + * filter used in encoding. + * + * \param[in] fn_ptr A table of function pointers to compute SSE. + * \param[in] x Pointer to struct holding all the data for + * the current macroblock. + * \param[in] bsize The current block_size. + * \param[in] args The args to handle_inter_mode, used to track + * the best SSE. + * \return Returns 1 if zero_mv is pruned, 0 otherwise. 
+ */ +static AOM_INLINE int prune_zero_mv_with_sse( + const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize, + const HandleInterModeArgs *args) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + + const int is_comp_pred = has_second_ref(mbmi); + const MV_REFERENCE_FRAME *refs = mbmi->ref_frame; + + // Check that the global mv is the same as ZEROMV + assert(mbmi->mv[0].as_int == 0); + assert(IMPLIES(is_comp_pred, mbmi->mv[0].as_int == 0)); + assert(xd->global_motion[refs[0]].wmtype == TRANSLATION || + xd->global_motion[refs[0]].wmtype == IDENTITY); + + // Don't prune if we have invalid data + for (int idx = 0; idx < 1 + is_comp_pred; idx++) { + assert(mbmi->mv[0].as_int == 0); + if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) { + return 0; + } + } + + // Sum up the sse of ZEROMV and best NEWMV + unsigned int this_sse_sum = 0; + unsigned int best_sse_sum = 0; + for (int idx = 0; idx < 1 + is_comp_pred; idx++) { + const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + const struct macroblockd_plane *pd = xd->plane; + const struct buf_2d *src_buf = &p->src; + const struct buf_2d *ref_buf = &pd->pre[idx]; + const uint8_t *src = src_buf->buf; + const uint8_t *ref = ref_buf->buf; + const int src_stride = src_buf->stride; + const int ref_stride = ref_buf->stride; + + unsigned int this_sse; + fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse); + this_sse_sum += this_sse; + + const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]]; + best_sse_sum += best_sse; + } + if (this_sse_sum > best_sse_sum) { + return 1; + } + + return 0; +} + /*!\brief AV1 inter mode RD computation * * \ingroup inter_mode_search @@ -2589,12 +2577,11 @@ static int64_t handle_inter_mode( const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - const GF_GROUP *const gf_group = &cpi->gf_group; - const int tpl_idx = gf_group->index; - TplDepFrame *tpl_frame = 
&cpi->tpl_data.tpl_frame[tpl_idx]; + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const int prune_modes_based_on_tpl = cpi->sf.inter_sf.prune_inter_modes_based_on_tpl && - tpl_idx < MAX_TPL_FRAME_IDX && tpl_frame->is_valid; + tpl_idx < MAX_TPL_FRAME_IDX && tpl_data->tpl_frame[tpl_idx].is_valid; int i; // Reference frames for this mode const int refs[2] = { mbmi->ref_frame[0], @@ -2606,10 +2593,10 @@ static int64_t handle_inter_mode( // of these currently holds the best predictor, and use the other // one for future predictions. In the end, copy from tmp_buf to // dst if necessary. - struct macroblockd_plane *p = xd->plane; + struct macroblockd_plane *pd = xd->plane; const BUFFER_SET orig_dst = { - { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, - { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, }; const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, tmp_buf + 2 * MAX_SB_SQUARE }, @@ -2645,8 +2632,8 @@ static int64_t handle_inter_mode( // Save MV results from first 2 ref_mv_idx. int_mv save_mv[MAX_REF_MV_SEARCH - 1][2]; int best_ref_mv_idx = -1; - const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, - mode_info, bsize, ref_set); + const int idx_mask = + ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set); const int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); const ModeCosts *mode_costs = &x->mode_costs; @@ -2669,9 +2656,14 @@ static int64_t handle_inter_mode( // WARPED_CAUSAL) // 6.) 
Update stats if best so far for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mbmi->ref_mv_idx = ref_mv_idx; + mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV; - mode_info[ref_mv_idx].mv.as_int = INVALID_MV; - mode_info[ref_mv_idx].rd = INT64_MAX; + mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX; + const int drl_cost = get_drl_cost( + mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); + mode_info[ref_mv_idx].drl_cost = drl_cost; + mode_info[ref_mv_idx].skip = 0; if (!mask_check_bit(idx_mask, ref_mv_idx)) { // MV did not perform well in simple translation search. Skip it. @@ -2695,14 +2687,10 @@ static int64_t handle_inter_mode( mbmi->num_proj_ref = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; - mbmi->ref_mv_idx = ref_mv_idx; // Compute cost for signalling this DRL index rd_stats->rate = base_rate; - const int drl_cost = get_drl_cost( - mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); rd_stats->rate += drl_cost; - mode_info[ref_mv_idx].drl_cost = drl_cost; int rs = 0; int compmode_interinter_cost = 0; @@ -2731,17 +2719,16 @@ static int64_t handle_inter_mode( if (newmv_ret_val != 0) continue; - rd_stats->rate += rate_mv; + if (is_inter_singleref_mode(this_mode) && + cur_mv[0].as_int != INVALID_MV) { + const MV_REFERENCE_FRAME ref = refs[0]; + const unsigned int this_sse = x->pred_sse[ref]; + if (this_sse < args->best_single_sse_in_refs[ref]) { + args->best_single_sse_in_refs[ref] = this_sse; + } + } - // skip NEWMV mode in drl if the motion search result is the same - // as a previous result - if (cpi->sf.inter_sf.skip_repeated_newmv && - skip_repeated_newmv(cpi, x, bsize, do_tx_search, this_mode, - &best_mbmi, motion_mode_cand, &ref_best_rd, - &best_rd_stats, &best_rd_stats_y, - &best_rd_stats_uv, mode_info, args, drl_cost, - refs, cur_mv, &best_rd, orig_dst, ref_mv_idx)) - continue; + rd_stats->rate += rate_mv; } // Copy the motion vector for this mode into mbmi struct for (i = 0; i < is_comp_pred + 1; ++i) { @@ -2760,6 
+2747,14 @@ static int64_t handle_inter_mode( cpi->sf.inter_sf.prune_ref_mv_idx_search)) continue; + if (cpi->sf.gm_sf.prune_zero_mv_with_sse && + cpi->sf.gm_sf.gm_search_type == GM_DISABLE_SEARCH && + (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) { + if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args)) { + continue; + } + } + #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, compound_type_rd_time); #endif @@ -2843,12 +2838,6 @@ static int64_t handle_inter_mode( if (ret_val != INT64_MAX) { int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (tmp_rd < mode_info[ref_mv_idx].rd) { - // Only update mode_info if the new result is actually better. - mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; - mode_info[ref_mv_idx].rate_mv = rate_mv; - mode_info[ref_mv_idx].rd = tmp_rd; - } const THR_MODES mode_enum = get_prediction_mode_idx( mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Collect mode stats for multiwinner mode processing @@ -2928,11 +2917,11 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, const int mi_col = xd->mi_col; const int w = block_size_wide[bsize]; const int h = block_size_high[bsize]; - const int sb_row = mi_row >> cm->seq_params.mib_size_log2; - const int sb_col = mi_col >> cm->seq_params.mib_size_log2; + const int sb_row = mi_row >> cm->seq_params->mib_size_log2; + const int sb_col = mi_col >> cm->seq_params->mib_size_log2; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; - MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); @@ -2952,7 +2941,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int_mv dv_ref = nearestmv.as_int == 0 ? 
nearmv : nearestmv; if (dv_ref.as_int == 0) { - av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row); + av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row); } // Ref DV should not have sub-pel. assert((dv_ref.as_mv.col & 7) == 0); @@ -2983,7 +2972,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize, &dv_ref.as_mv, lookahead_search_sites, /*fine_search_interval=*/0); - const IntraBCMVCosts *const dv_costs = &cpi->dv_costs; + const IntraBCMVCosts *const dv_costs = x->dv_costs; av1_set_ms_to_intra_mode(&fullms_params, dv_costs); for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; @@ -2997,19 +2986,19 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, fullms_params.mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; fullms_params.mv_limits.row_max = - (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h; + (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h; break; case IBC_MOTION_LEFT: fullms_params.mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; fullms_params.mv_limits.col_max = - (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w; + (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w; // TODO(aconverse@google.com): Minimize the overlap between above and // left areas. 
fullms_params.mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; int bottom_coded_mi_edge = - AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end); + AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end); fullms_params.mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; break; @@ -3047,7 +3036,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, get_fullmv_from_mv(&dv))) continue; if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize, - cm->seq_params.mib_size_log2)) + cm->seq_params->mib_size_log2)) continue; // DV should not have sub-pel. @@ -3065,12 +3054,10 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, av1_num_planes(cm) - 1); - int *dvcost[2] = { (int *)&dv_costs->mv_component[0][MV_MAX], - (int *)&dv_costs->mv_component[1][MV_MAX] }; // TODO(aconverse@google.com): The full motion field defining discount // in MV_COST_WEIGHT is too large. Explore other values. 
const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv, - dvcost, MV_COST_WEIGHT_SUB); + dv_costs->dv_costs, MV_COST_WEIGHT_SUB); const int rate_mode = x->mode_costs.intrabc_cost[1]; RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv; if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y, @@ -3186,7 +3173,6 @@ static AOM_INLINE void rd_pick_skip_mode( const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - const TxfmSearchParams *txfm_params = &x->txfm_search_params; x->compound_idx = 1; // COMPOUND_AVERAGE RD_STATS skip_mode_rd_stats; @@ -3247,6 +3233,8 @@ static AOM_INLINE void rd_pick_skip_mode( mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->ref_mv_idx = 0; mbmi->skip_mode = mbmi->skip_txfm = 1; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; set_default_interp_filters(mbmi, cm->features.interp_filter); @@ -3283,45 +3271,12 @@ static AOM_INLINE void rd_pick_skip_mode( assert(mode_index != THR_INVALID); search_state->best_mbmode.skip_mode = 1; search_state->best_mbmode = *mbmi; - - search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip_txfm = - 1; - search_state->best_mbmode.mode = NEAREST_NEARESTMV; - search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0]; - search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1]; - search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int; - search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int; - search_state->best_mbmode.ref_mv_idx = 0; - - // Set up tx_size related variables for skip-specific loop filtering. - search_state->best_mbmode.tx_size = - block_signals_txsize(bsize) - ? 
tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type) - : max_txsize_rect_lookup[bsize]; memset(search_state->best_mbmode.inter_tx_size, search_state->best_mbmode.tx_size, sizeof(search_state->best_mbmode.inter_tx_size)); set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height, search_state->best_mbmode.skip_txfm && is_inter_block(mbmi), xd); - - // Set up color-related variables for skip mode. - search_state->best_mbmode.uv_mode = UV_DC_PRED; - search_state->best_mbmode.palette_mode_info.palette_size[0] = 0; - search_state->best_mbmode.palette_mode_info.palette_size[1] = 0; - - search_state->best_mbmode.comp_group_idx = 0; - search_state->best_mbmode.compound_idx = x->compound_idx; - search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE; - search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION; - - search_state->best_mbmode.interintra_mode = - (INTERINTRA_MODE)(II_DC_PRED - 1); - search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0; - - set_default_interp_filters(&search_state->best_mbmode, - cm->features.interp_filter); - search_state->best_mode_index = mode_index; // Update rd_cost @@ -3798,7 +3753,7 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( // compound ref. 
if (skip_ref_frame_mask & (1 << ref_frame) && !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) && - !is_ref_frame_used_in_cache(ref_frame, x->intermode_cache)) { + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { continue; } assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); @@ -3824,7 +3779,7 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( } if (skip_ref_frame_mask & (1 << ref_frame) && - !is_ref_frame_used_in_cache(ref_frame, x->intermode_cache)) { + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { continue; } // Ref mv list population is not required, when compound references are @@ -3841,9 +3796,16 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( } av1_count_overlappable_neighbors(cm, xd); - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < - cpi->sf.inter_sf.prune_obmc_prob_thresh; + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int obmc_probability; +#if CONFIG_FRAME_PARALLEL_ENCODE + obmc_probability = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize]; +#else + obmc_probability = cpi->frame_probs.obmc_probs[update_type][bsize]; +#endif + const int prune_obmc = + obmc_probability < cpi->sf.inter_sf.prune_obmc_prob_thresh; if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) { if (check_num_overlappable_neighbors(mbmi) && is_motion_variation_allowed_bsize(bsize)) { @@ -3874,6 +3836,10 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( set_mode_eval_params(cpi, x, MODE_EVAL); x->comp_rd_stats_idx = 0; + + for (int idx = 0; idx < REF_FRAMES; idx++) { + args->best_single_sse_in_refs[idx] = INT32_MAX; + } } static AOM_INLINE void init_inter_mode_search_state( @@ -4060,8 +4026,8 @@ static int inter_mode_search_order_independent_skip( } // Reuse the prediction mode in cache - if (x->use_intermode_cache) { - const MB_MODE_INFO 
*cached_mi = x->intermode_cache; + if (x->use_mb_mode_cache) { + const MB_MODE_INFO *cached_mi = x->mb_mode_cache; const PREDICTION_MODE cached_mode = cached_mi->mode; const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame; const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME; @@ -4156,12 +4122,12 @@ static int inter_mode_search_order_independent_skip( } // If we are reusing the prediction from cache, and the current frame is // required by the cache, then we cannot prune it. - if (is_ref_frame_used_in_cache(ref_type, x->intermode_cache)) { + if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) { skip_ref = 0; // If the cache only needs the current reference type for compound // prediction, then we can skip motion mode search. skip_motion_mode = (ref_type <= ALTREF_FRAME && - x->intermode_cache->ref_frame[1] > INTRA_FRAME); + x->mb_mode_cache->ref_frame[1] > INTRA_FRAME); } if (skip_ref) return 1; } @@ -4452,12 +4418,14 @@ static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi, // Prune compound mode using ref frames of neighbor blocks. static INLINE int compound_skip_using_neighbor_refs( MACROBLOCKD *const xd, const PREDICTION_MODE this_mode, - const MV_REFERENCE_FRAME *ref_frames, int prune_compound_using_neighbors) { + const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) { // Exclude non-extended compound modes from pruning if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV) return 0; + if (prune_ext_comp_using_neighbors >= 3) return 1; + int is_ref_match[2] = { 0 }; // 0 - match for forward refs // 1 - match for backward refs // Check if ref frames of this block matches with left neighbor. @@ -4472,7 +4440,7 @@ static INLINE int compound_skip_using_neighbor_refs( const int track_ref_match = is_ref_match[0] + is_ref_match[1]; // Pruning based on ref frame match with neighbors. 
- if (track_ref_match >= prune_compound_using_neighbors) return 0; + if (track_ref_match >= prune_ext_comp_using_neighbors) return 0; return 1; } @@ -4629,10 +4597,10 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates( if (!is_inter_singleref_mode(mbmi->mode)) continue; x->txfm_search_info.skip_txfm = 0; - struct macroblockd_plane *p = xd->plane; + struct macroblockd_plane *pd = xd->plane; const BUFFER_SET orig_dst = { - { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, - { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, }; set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); @@ -4681,8 +4649,7 @@ typedef struct { int skip_ref_frame_mask; int reach_first_comp_mode; int mode_thresh_mul_fact; - int intra_mode_idx_ls[INTRA_MODES]; - int intra_mode_num; + int *intra_mode_idx_ls; int num_single_modes_processed; int prune_cpd_using_sr_stats_ready; } InterModeSFArgs; @@ -4693,7 +4660,6 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, InterModeSFArgs *args) { const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; // Get the actual prediction mode we are trying in this iteration const THR_MODES mode_enum = av1_default_mode_order[midx]; const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; @@ -4703,6 +4669,8 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; const int comp_pred = second_ref_frame > INTRA_FRAME; + if (ref_frame == INTRA_FRAME) return 1; + // Check if this mode should be skipped because it is incompatible with the // current frame if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames)) @@ -4739,23 +4707,6 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, return 1; } - // Speed 
features to prune out INTRA frames - if (ref_frame == INTRA_FRAME) { - if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra || - sf->intra_sf.disable_smooth_intra) && - (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || - mbmi->mode == SMOOTH_V_PRED)) - return 1; - if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra && - mbmi->mode == PAETH_PRED) - return 1; - - // Intra modes will be handled in another loop later. - assert(args->intra_mode_num < INTRA_MODES); - args->intra_mode_idx_ls[args->intra_mode_num++] = mode_enum; - return 1; - } - if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) { // After we done with single reference modes, find the 2nd best RD // for a reference frame. Only search compound modes that have a reference @@ -4770,10 +4721,10 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, return 1; } - if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) { + if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) { if (compound_skip_using_neighbor_refs( xd, this_mode, ref_frames, - sf->inter_sf.prune_compound_using_neighbors)) + sf->inter_sf.prune_ext_comp_using_neighbors)) return 1; } @@ -4851,8 +4802,9 @@ static void tx_search_best_inter_candidates( : INT64_MAX; *yrd = INT64_MAX; int64_t best_rd_in_this_partition = INT64_MAX; + int num_inter_mode_cands = inter_modes_info->num; // Iterate over best inter mode candidates and perform tx search - for (int j = 0; j < inter_modes_info->num; ++j) { + for (int j = 0; j < num_inter_mode_cands; ++j) { const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; *mbmi = inter_modes_info->mbmi_arr[data_idx]; int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; @@ -4930,6 +4882,27 @@ static void tx_search_best_inter_candidates( update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, x, txfm_search_done); search_state->best_skip_rd[0] = skip_rd; + // Limit the total number of modes to be evaluated 
if the first is valid + // and transform skip or compound + if (cpi->sf.inter_sf.inter_mode_txfm_breakout) { + if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) { + // Evaluate more candidates at high quantizers where occurrence of + // transform skip is high. + const int max_cands_cap[5] = { 2, 3, 5, 7, 9 }; + const int qindex_band = (5 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = + AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num); + } else if (!j && has_second_ref(&search_state->best_mbmode)) { + const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1; + // Evaluate more candidates at low quantizers where occurrence of + // single reference mode is high. + const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 }, + { 10, 7, 5, 3 } }; + const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = AOMMIN( + max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num); + } + } } } } @@ -5050,13 +5023,41 @@ static AOM_INLINE void search_intra_modes_in_interframe( const int num_4x4 = bsize_to_num_blk(bsize); // Performs luma search - for (int j = 0; j < sf_args->intra_mode_num; ++j) { + int64_t best_model_rd = INT64_MAX; + int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT; + ++mode_idx) { if (sf->intra_sf.skip_intra_in_interframe && search_state->intra_search_state.skip_intra_modes) break; - const THR_MODES mode_enum = sf_args->intra_mode_idx_ls[j]; - const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; - const PREDICTION_MODE this_mode = mode_def->mode; + set_y_mode_and_delta_angle(mode_idx, mbmi); + + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. 
+ if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode)) + continue; + + THR_MODES mode_enum = 0; + for (int i = 0; i < INTRA_MODE_END; ++i) { + if (mbmi->mode == av1_mode_defs[sf_args->intra_mode_idx_ls[i]].mode) { + mode_enum = sf_args->intra_mode_idx_ls[i]; + break; + } + } + if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra && + mbmi->mode == PAETH_PRED) + continue; + if (av1_is_directional_mode(mbmi->mode) && + av1_use_angle_delta(bsize) == 0 && mbmi->angle_delta[PLANE_TYPE_Y] != 0) + continue; + const PREDICTION_MODE this_mode = mbmi->mode; assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME); assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME); @@ -5084,7 +5085,8 @@ static AOM_INLINE void search_intra_modes_in_interframe( int64_t intra_rd_y = INT64_MAX; const int is_luma_result_valid = av1_handle_intra_y_mode( intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx, - &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y); + &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y, + &best_model_rd, top_intra_model_rd); if (is_luma_result_valid && intra_rd_y < yrd_threshold) { is_best_y_mode_intra = 1; if (intra_rd_y < best_rd_y) { @@ -5147,12 +5149,6 @@ static AOM_INLINE void search_intra_modes_in_interframe( intra_rd_stats_uv.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); } - if (mode != DC_PRED && mode != PAETH_PRED) { - const int intra_cost_penalty = av1_get_intra_cost_penalty( - cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, - cm->seq_params.bit_depth); - intra_rd_stats.rate += intra_cost_penalty; - } // Intra block is always coded as non-skip intra_rd_stats.skip_txfm = 0; @@ -5189,6 +5185,84 @@ static AOM_INLINE void search_intra_modes_in_interframe( } 
} +#if !CONFIG_REALTIME_ONLY +// Prepare inter_cost and intra_cost from TPL stats, which are used as ML +// features in intra mode pruning. +static AOM_INLINE void calculate_cost_from_tpl_data( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t *inter_cost, int64_t *intra_cost) { + const AV1_COMMON *const cm = &cpi->common; + // Only consider full SB. + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d; + const int len = (block_size_wide[sb_size] / tpl_bsize_1d) * + (block_size_high[sb_size] / tpl_bsize_1d); + SuperBlockEnc *sb_enc = &x->sb_enc; + if (sb_enc->tpl_data_count == len) { + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d); + const int tpl_stride = sb_enc->tpl_stride; + const int tplw = mi_size_wide[tpl_bsize]; + const int tplh = mi_size_high[tpl_bsize]; + const int nw = mi_size_wide[bsize] / tplw; + const int nh = mi_size_high[bsize] / tplh; + if (nw >= 1 && nh >= 1) { + const int of_h = mi_row % mi_size_high[sb_size]; + const int of_w = mi_col % mi_size_wide[sb_size]; + const int start = of_h / tplh * tpl_stride + of_w / tplw; + + for (int k = 0; k < nh; k++) { + for (int l = 0; l < nw; l++) { + *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l]; + *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l]; + } + } + *inter_cost /= nw * nh; + *intra_cost /= nw * nh; + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +// When the speed feature skip_intra_in_interframe > 0, enable ML model to prune +// intra mode search. 
+static AOM_INLINE void skip_intra_modes_in_interframe( + AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize, + InterModeSearchState *search_state, int64_t inter_cost, int64_t intra_cost, + int skip_intra_in_interframe) { + MACROBLOCKD *const xd = &x->e_mbd; + if (inter_cost >= 0 && intra_cost >= 0) { + aom_clear_system_state(); + const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) + ? &av1_intrap_nn_config + : &av1_intrap_hd_nn_config; + float nn_features[6]; + float scores[2] = { 0.0f }; + + nn_features[0] = (float)search_state->best_mbmode.skip_txfm; + nn_features[1] = (float)mi_size_wide_log2[bsize]; + nn_features[2] = (float)mi_size_high_log2[bsize]; + nn_features[3] = (float)intra_cost; + nn_features[4] = (float)inter_cost; + const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); + nn_features[5] = (float)(ac_q_max / ac_q); + + av1_nn_predict(nn_features, nn_config, 1, scores); + aom_clear_system_state(); + + // For two parameters, the max prob returned from av1_nn_softmax equals + // 1.0 / (1.0 + e^(-|diff_score|)). Here use scores directly to avoid the + // calling of av1_nn_softmax. + const float thresh[2] = { 1.4f, 1.4f }; + if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) { + search_state->intra_search_state.skip_intra_modes = 1; + } + } else if ((search_state->best_mbmode.skip_txfm) && + (skip_intra_in_interframe >= 2)) { + search_state->intra_search_state.skip_intra_modes = 1; + } +} + // TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb. 
void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_STATS *rd_cost, @@ -5231,6 +5305,7 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, -1, -1, -1, + { 0 }, { 0 } }; for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1; // Indicates the appropriate number of simple translation winner modes for @@ -5265,10 +5340,13 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, mbmi->partition != PARTITION_HORZ) || cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions >= 2) { picked_ref_frames_mask = - fetch_picked_ref_frames_mask(x, bsize, cm->seq_params.mib_size); + fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, set_params_rd_pick_inter_mode_time); +#endif // Skip ref frames that never selected by square blocks. const int skip_ref_frame_mask = picked_ref_frames_mask ? ~picked_ref_frames_mask : 0; @@ -5280,6 +5358,9 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask, skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, set_params_rd_pick_inter_mode_time); +#endif int64_t best_est_rd = INT64_MAX; const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; @@ -5292,6 +5373,10 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, cpi->sf.rt_sf.force_tx_search_off); InterModesInfo *inter_modes_info = x->inter_modes_info; inter_modes_info->num = 0; + int intra_mode_idx_ls[INTRA_MODES]; + for (i = 0; i < INTRA_MODES; ++i) { + intra_mode_idx_ls[i] = i + THR_DC; + } // Temporary buffers used by handle_inter_mode(). 
uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]); @@ -5337,40 +5422,13 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, const int do_pruning = (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1; if (do_pruning && sf->intra_sf.skip_intra_in_interframe && - cpi->oxcf.algo_cfg.enable_tpl_model) { - // Only consider full SB. - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; - const int tpl_bsize_1d = cpi->tpl_data.tpl_bsize_1d; - const int len = (block_size_wide[sb_size] / tpl_bsize_1d) * - (block_size_high[sb_size] / tpl_bsize_1d); - SuperBlockEnc *sb_enc = &x->sb_enc; - if (sb_enc->tpl_data_count == len) { - const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d); - const int tpl_stride = sb_enc->tpl_stride; - const int tplw = mi_size_wide[tpl_bsize]; - const int tplh = mi_size_high[tpl_bsize]; - const int nw = mi_size_wide[bsize] / tplw; - const int nh = mi_size_high[bsize] / tplh; - if (nw >= 1 && nh >= 1) { - const int of_h = mi_row % mi_size_high[sb_size]; - const int of_w = mi_col % mi_size_wide[sb_size]; - const int start = of_h / tplh * tpl_stride + of_w / tplw; - - for (int k = 0; k < nh; k++) { - for (int l = 0; l < nw; l++) { - inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l]; - intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l]; - } - } - inter_cost /= nw * nh; - intra_cost /= nw * nh; - } - } - } + cpi->oxcf.algo_cfg.enable_tpl_model) + calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost, + &intra_cost); #endif // !CONFIG_REALTIME_ONLY // Initialize best mode stats for winner mode processing - av1_zero(x->winner_mode_stats); + av1_zero_array(x->winner_mode_stats, MAX_WINNER_MODE_COUNT_INTER); x->winner_mode_count = 0; store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, best_rd_so_far, @@ -5389,20 +5447,20 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, 
skip_ref_frame_mask, 0, mode_thresh_mul_fact, - { 0 }, - 0, + intra_mode_idx_ls, 0, 0 }; int64_t best_inter_yrd = INT64_MAX; - // This is the main loop of this function. It loops over all possible modes - // and calls handle_inter_mode() to compute the RD for each. + // This is the main loop of this function. It loops over all possible inter + // modes and calls handle_inter_mode() to compute the RD for each. // Here midx is just an iterator index that should not be used by itself // except to keep track of the number of modes searched. It should be used // with av1_default_mode_order to get the enum that defines the mode, which // can be used with av1_mode_defs to get the prediction mode and the ref // frames. - for (THR_MODES midx = THR_MODE_START; midx < THR_MODE_END; ++midx) { + for (THR_MODES midx = THR_INTER_MODE_START; midx < THR_INTER_MODE_END; + ++midx) { // Get the actual prediction mode we are trying in this iteration const THR_MODES mode_enum = av1_default_mode_order[midx]; const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; @@ -5420,9 +5478,16 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, txfm_info->skip_txfm = 0; sf_args.num_single_modes_processed += is_single_pred; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, skip_inter_mode_time); +#endif // Apply speed features to decide if this inter mode can be skipped - if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue; + const int is_skip_inter_mode = + skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, skip_inter_mode_time); +#endif + if (is_skip_inter_mode) continue; // Select prediction reference frames. 
for (i = 0; i < num_planes; i++) { @@ -5549,36 +5614,11 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, // Gate intra mode evaluation if best of inter is skip except when source // variance is extremely low const unsigned int src_var_thresh_intra_skip = 1; - if (sf->intra_sf.skip_intra_in_interframe && - (x->source_variance > src_var_thresh_intra_skip)) { - if (inter_cost >= 0 && intra_cost >= 0) { - aom_clear_system_state(); - const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) - ? &av1_intrap_nn_config - : &av1_intrap_hd_nn_config; - float nn_features[6]; - float scores[2] = { 0.0f }; - float probs[2] = { 0.0f }; - - nn_features[0] = (float)search_state.best_mbmode.skip_txfm; - nn_features[1] = (float)mi_size_wide_log2[bsize]; - nn_features[2] = (float)mi_size_high_log2[bsize]; - nn_features[3] = (float)intra_cost; - nn_features[4] = (float)inter_cost; - const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); - const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); - nn_features[5] = (float)(ac_q_max / ac_q); - - av1_nn_predict(nn_features, nn_config, 1, scores); - aom_clear_system_state(); - av1_nn_softmax(scores, probs, 2); - - if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1; - } else if ((search_state.best_mbmode.skip_txfm) && - (sf->intra_sf.skip_intra_in_interframe >= 2)) { - search_state.intra_search_state.skip_intra_modes = 1; - } - } + const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe; + if (skip_intra_in_interframe && + (x->source_variance > src_var_thresh_intra_skip)) + skip_intra_modes_in_interframe(cm, x, bsize, &search_state, inter_cost, + intra_cost, skip_intra_in_interframe); const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME]; search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx, @@ -5588,6 +5628,9 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, end_timing(cpi, 
handle_intra_mode_time); #endif +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, refine_winner_mode_tx_time); +#endif int winner_mode_count = cpi->sf.winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1; // In effect only when fast tx search speed features are enabled. @@ -5595,6 +5638,9 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index, &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, refine_winner_mode_tx_time); +#endif // Initialize default mode evaluation params set_mode_eval_params(cpi, x, DEFAULT_EVAL); @@ -5803,7 +5849,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, for (i = 0; i < SWITCHABLE_FILTERS; ++i) { mbmi->interp_filters = av1_broadcast_interp_filter(i); rs = av1_get_switchable_rate(x, xd, interp_filter, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); if (rs < best_rs) { best_rs = rs; best_filter = mbmi->interp_filters.as_filters.y_filter; @@ -5814,7 +5860,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, // Set the appropriate filter mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); rate2 += av1_get_switchable_rate(x, xd, interp_filter, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) rate2 += comp_inter_cost[comp_pred]; diff --git a/third_party/libaom/source/libaom/av1/encoder/rdopt.h b/third_party/libaom/source/libaom/av1/encoder/rdopt.h index 362da7b798..055a49e9f1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rdopt.h +++ b/third_party/libaom/source/libaom/av1/encoder/rdopt.h @@ -217,10 +217,10 @@ static INLINE int av1_encoder_get_relative_dist(int a, int b) { static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const 
cm) { const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize]; int sb_mi_rows = - (mi_size_wide[cm->seq_params.sb_size] + mi_alloc_size_1d - 1) / + (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) / mi_alloc_size_1d; - assert(mi_size_wide[cm->seq_params.sb_size] == - mi_size_high[cm->seq_params.sb_size]); + assert(mi_size_wide[cm->seq_params->sb_size] == + mi_size_high[cm->seq_params->sb_size]); int sb_mi_size = sb_mi_rows * sb_mi_rows; return sb_mi_size; diff --git a/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h b/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h index ddd180f7ed..f00037992e 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h @@ -433,8 +433,10 @@ static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode; if (!winner_mode_tx_type_pruning) return; - const int prune_mode[2][2] = { { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 }, - { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 } }; + const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } }; txfm_params->prune_2d_txfm_mode = prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode]; } @@ -569,7 +571,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; - if (cm->seq_params.monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED; + if (cm->seq_params->monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED; if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, diff --git a/third_party/libaom/source/libaom/av1/encoder/segmentation.c b/third_party/libaom/source/libaom/av1/encoder/segmentation.c index de17d571ff..edb6ef67fa 100644 --- 
a/third_party/libaom/source/libaom/av1/encoder/segmentation.c +++ b/third_party/libaom/source/libaom/av1/encoder/segmentation.c @@ -175,6 +175,14 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { int no_pred_cost; int t_pred_cost = INT_MAX; int tile_col, tile_row, mi_row, mi_col; + + if (!seg->update_map) return; + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + seg->temporal_update = 0; + assert(seg->update_data == 1); + return; + } + unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } }; unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 }; unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; @@ -194,15 +202,15 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { tile_info.mi_row_start * cm->mi_params.mi_stride + tile_info.mi_col_start; for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->seq_params.mib_size, - mi_ptr += cm->seq_params.mib_size * cm->mi_params.mi_stride) { + mi_row += cm->seq_params->mib_size, + mi_ptr += cm->seq_params->mib_size * cm->mi_params.mi_stride) { MB_MODE_INFO **mi = mi_ptr; for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size, - mi += cm->seq_params.mib_size) { + mi_col += cm->seq_params->mib_size, + mi += cm->seq_params->mib_size) { count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, - mi_col, cm->seq_params.sb_size); + mi_col, cm->seq_params->sb_size); } } } diff --git a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c index 1c556c2a09..dbfcaabbd6 100644 --- a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c +++ b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c @@ -8,7 +8,6 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at 
www.aomedia.org/license/patent. */ -#include <float.h> #include "av1/common/av1_common_int.h" #include "av1/encoder/sparse_linear_solver.h" #include "config/aom_config.h" @@ -408,4 +407,4 @@ void av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl, aom_free(Ad); } -#endif // CONFIG_OPFL +#endif // CONFIG_OPTICAL_FLOW_API diff --git a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h index 3cacb51b93..a3f2f7b964 100644 --- a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h +++ b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_SPARSE_LINEAR_SOLVER_H_ -#define AV1_COMMON_SPARSE_LINEAR_SOLVER_H_ +#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ +#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ #ifdef __cplusplus extern "C" { @@ -64,4 +64,4 @@ void av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl, } // extern "C" #endif -#endif /* AV1_COMMON_SPARSE_LINEAR_SOLVER_H_ */ +#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */ diff --git a/third_party/libaom/source/libaom/av1/encoder/speed_features.c b/third_party/libaom/source/libaom/av1/encoder/speed_features.c index 2244aaae91..916a818513 100644 --- a/third_party/libaom/source/libaom/av1/encoder/speed_features.c +++ b/third_party/libaom/source/libaom/av1/encoder/speed_features.c @@ -274,6 +274,20 @@ static void set_allintra_speed_feature_framesize_dependent( sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; } + + if (speed >= 7) { + if (!is_480p_or_larger) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + } + + if (speed >= 8) { + // TODO(kyslov): add more speed features to control speed/quality + } + + if (speed >= 9) { + // TODO(kyslov): add more speed features to control speed/quality + } } static void 
set_allintra_speed_features_framesize_independent( @@ -289,8 +303,11 @@ static void set_allintra_speed_features_framesize_independent( sf->part_sf.prune_part4_search = 2; sf->part_sf.simple_motion_search_prune_rect = 1; sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; sf->tx_sf.adaptive_txb_search_level = 1; sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; @@ -300,7 +317,7 @@ static void set_allintra_speed_features_framesize_independent( sf->rt_sf.use_nonrd_pick_mode = 0; sf->rt_sf.use_real_time_ref_set = 0; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || + if (cpi->ppi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->use_screen_content_tools) { sf->mv_sf.exhaustive_searches_thresh = (1 << 20); } else { @@ -318,10 +335,12 @@ static void set_allintra_speed_features_framesize_independent( // speed feature accordingly sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2; sf->part_sf.ml_predict_breakout_level = use_hbd ? 
2 : 3; + sf->part_sf.reuse_best_prediction_for_part_ab = 1; sf->mv_sf.exhaustive_searches_thresh <<= 1; sf->intra_sf.prune_palette_search_level = 1; + sf->intra_sf.top_intra_model_count_allowed = 3; sf->tx_sf.adaptive_txb_search_level = 2; sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; @@ -348,6 +367,7 @@ static void set_allintra_speed_features_framesize_independent( sf->intra_sf.disable_smooth_intra = 1; sf->intra_sf.intra_pruning_with_hog = 2; + sf->intra_sf.prune_filter_intra_level = 1; sf->rd_sf.perform_coeff_opt = 3; @@ -397,9 +417,6 @@ static void set_allintra_speed_features_framesize_independent( sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; - sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; sf->intra_sf.prune_chroma_modes_using_luma_winner = 1; sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL; @@ -408,7 +425,7 @@ static void set_allintra_speed_features_framesize_independent( sf->tpl_sf.subpel_force_stop = HALF_PEL; sf->tpl_sf.search_method = FAST_BIGDIA; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; @@ -443,9 +460,10 @@ static void set_allintra_speed_features_framesize_independent( } if (speed >= 6) { - sf->intra_sf.disable_filter_intra = 1; + sf->intra_sf.prune_filter_intra_level = 2; sf->intra_sf.chroma_intra_pruning_with_hog = 4; sf->intra_sf.intra_pruning_with_hog = 4; + sf->intra_sf.cfl_search_range = 1; sf->part_sf.prune_rectangular_split_based_on_qidx = allow_screen_content_tools ? 
0 : 1; @@ -458,7 +476,7 @@ static void set_allintra_speed_features_framesize_independent( sf->mv_sf.use_bsize_dependent_search_method = 1; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0; // Use largest txfm block size for square coding blocks. sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2; @@ -466,10 +484,39 @@ static void set_allintra_speed_features_framesize_independent( sf->rd_sf.perform_coeff_opt = 6; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; } + if (speed >= 7) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 1; + sf->rt_sf.nonrd_check_partition_split = 0; + sf->rt_sf.skip_intra_pred_if_tx_skip = 1; + // Set mask for intra modes. + for (int i = 0; i < BLOCK_SIZES; ++i) + if (i >= BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + // Use DC, H, V intra mode for block sizes < 32X32. + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + } + + if (speed >= 8) { + // TODO(kyslov): add more speed features to control speed/quality + } + + if (speed >= 9) { + // TODO(kyslov): add more speed features to control speed/quality + } + // Intra txb hash is currently not compatible with multi-winner mode as the // hashes got reset during multi-winner mode processing. 
assert(IMPLIES( @@ -480,6 +527,7 @@ static void set_allintra_speed_features_framesize_independent( static void set_good_speed_feature_framesize_dependent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; @@ -518,7 +566,16 @@ static void set_good_speed_feature_framesize_dependent( sf->mv_sf.use_downsampled_sad = 1; } + if (!is_720p_or_larger) { + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + const int rate_tolerance = + AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); + sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2); + } + if (speed >= 1) { + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1; + if (is_720p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; } else if (is_480p_or_larger) { @@ -561,6 +618,12 @@ static void set_good_speed_feature_framesize_dependent( } if (is_480p_or_larger) { + sf->inter_sf.disable_interintra_wedge_var_thresh = 100; + } else { + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + } + + if (is_480p_or_larger) { sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; } else { @@ -573,6 +636,8 @@ static void set_good_speed_feature_framesize_dependent( } if (speed >= 3) { + sf->inter_sf.skip_newmv_in_drl = 2; + sf->part_sf.ml_early_term_after_part_split_level = 0; if (is_720p_or_larger) { @@ -584,6 +649,10 @@ static void set_good_speed_feature_framesize_dependent( sf->part_sf.partition_search_breakout_rate_thr = 120; } if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + + if (is_480p_or_larger) sf->intra_sf.top_intra_model_count_allowed = 2; + + 
sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; } if (speed >= 4) { @@ -598,11 +667,14 @@ static void set_good_speed_feature_framesize_dependent( } sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3; if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 32; else sf->hl_sf.recode_tolerance = 55; + + sf->intra_sf.top_intra_model_count_allowed = 2; } if (speed >= 5) { @@ -612,6 +684,8 @@ static void set_good_speed_feature_framesize_dependent( sf->inter_sf.prune_warped_prob_thresh = 8; } if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40; + + sf->inter_sf.skip_newmv_in_drl = 4; } if (speed >= 6) { @@ -630,7 +704,9 @@ static void set_good_speed_feature_framesize_dependent( } if (!is_720p_or_larger) { - sf->inter_sf.mv_cost_upd_level = 2; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET; + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; } if (is_720p_or_larger) { @@ -650,10 +726,10 @@ static void set_good_speed_feature_framesize_dependent( static void set_good_speed_features_framesize_independent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int boosted = frame_is_boosted(cpi); const int is_boosted_arf2_bwd_type = - boosted || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; + boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; const int allow_screen_content_tools = cm->features.allow_screen_content_tools; const int use_hbd = cpi->oxcf.use_highbitdepth; @@ -670,6 +746,8 @@ static void set_good_speed_features_framesize_independent( sf->part_sf.prune_part4_search = 2; sf->part_sf.simple_motion_search_prune_rect = 1; sf->part_sf.ml_predict_breakout_level = use_hbd ? 
1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; // TODO(debargha): Test, tweak and turn on either 1 or 2 sf->inter_sf.inter_mode_rd_model_estimation = 1; @@ -698,7 +776,7 @@ static void set_good_speed_features_framesize_independent( sf->rt_sf.use_nonrd_pick_mode = 0; sf->rt_sf.use_real_time_ref_set = 0; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || + if (cpi->ppi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->use_screen_content_tools) { sf->mv_sf.exhaustive_searches_thresh = (1 << 20); } else { @@ -725,7 +803,6 @@ static void set_good_speed_features_framesize_independent( sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; sf->mv_sf.disable_extensive_joint_motion_search = 1; - sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; sf->inter_sf.prune_comp_type_by_comp_avg = 1; sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1; @@ -736,7 +813,6 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3; sf->inter_sf.reuse_inter_intra_mode = 1; sf->inter_sf.selective_ref_frame = 2; - sf->inter_sf.skip_repeated_newmv = 1; sf->interp_sf.use_interp_filter = 1; @@ -766,7 +842,11 @@ static void set_good_speed_features_framesize_independent( if (speed >= 2) { sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + sf->fp_sf.skip_motion_search_threshold = 25; + sf->part_sf.allow_partition_search_skip = 1; + sf->part_sf.reuse_best_prediction_for_part_ab = + !frame_is_intra_only(&cpi->common); sf->mv_sf.auto_mv_step_size = 1; sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL; @@ -778,20 +858,21 @@ static void set_good_speed_features_framesize_independent( // bit more closely to figure out why. 
sf->inter_sf.adaptive_rd_thresh = 1; sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; - sf->inter_sf.disable_interintra_wedge_var_thresh = 100; sf->inter_sf.disable_interinter_wedge_var_thresh = 100; sf->inter_sf.fast_interintra_wedge_search = 1; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1; - sf->inter_sf.prune_compound_using_neighbors = 1; + sf->inter_sf.prune_ext_comp_using_neighbors = 1; sf->inter_sf.prune_comp_using_best_single_mode_ref = 2; sf->inter_sf.prune_comp_type_by_comp_avg = 2; - sf->inter_sf.reuse_best_prediction_for_part_ab = 1; sf->inter_sf.selective_ref_frame = 3; sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; // Enable fast search only for COMPOUND_DIFFWTD type. sf->inter_sf.enable_fast_compound_mode_search = 1; sf->inter_sf.reuse_mask_search_results = 1; sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 1; + sf->inter_sf.disable_interinter_wedge_newmv_search = + is_boosted_arf2_bwd_type ? 0 : 1; + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1; // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3 sf->interp_sf.adaptive_interp_filter_search = 1; @@ -831,7 +912,8 @@ static void set_good_speed_features_framesize_independent( sf->mv_sf.search_method = DIAMOND; sf->mv_sf.disable_second_mv = 2; - sf->inter_sf.mv_cost_upd_level = 1; + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; sf->inter_sf.disable_onesided_comp = 1; // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine // it with cpi->sf.disable_wedge_search_var_thresh. @@ -843,10 +925,11 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2; sf->inter_sf.selective_ref_frame = 5; sf->inter_sf.skip_repeated_ref_mv = 1; - sf->inter_sf.skip_repeated_full_newmv = 1; sf->inter_sf.reuse_compound_type_decision = 1; sf->inter_sf.txfm_rd_gate_level = boosted ? 
0 : (is_boosted_arf2_bwd_type ? 1 : 2); + sf->inter_sf.enable_fast_wedge_mask_search = 1; + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2; sf->interp_sf.adaptive_interp_filter_search = 2; @@ -865,6 +948,8 @@ static void set_good_speed_features_framesize_independent( sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; sf->tx_sf.use_intra_txb_hash = 1; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; // TODO(any): Refactor the code related to following winner mode speed // features @@ -874,10 +959,10 @@ static void set_good_speed_features_framesize_independent( frame_is_intra_only(&cpi->common) ? 0 : 1; sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1; sf->winner_mode_sf.motion_mode_for_winner_cand = - boosted - ? 0 - : gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE ? 1 - : 2; + boosted ? 0 + : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE + ? 1 + : 2; // TODO(any): evaluate if these lpf features can be moved to speed 2. // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality @@ -889,6 +974,8 @@ static void set_good_speed_features_framesize_independent( } if (speed >= 4) { + sf->gm_sf.prune_zero_mv_with_sse = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->part_sf.simple_motion_search_prune_agg = 2; @@ -901,7 +988,7 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 3; sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 
0 : 2; - sf->inter_sf.prune_compound_using_neighbors = 2; + sf->inter_sf.prune_ext_comp_using_neighbors = 2; sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; sf->interp_sf.cb_pred_filter_search = 1; @@ -911,9 +998,10 @@ static void set_good_speed_features_framesize_independent( sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; - sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4. + // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; // TODO(any): Experiment with this speed feature set to 2 for higher quality // presets as well sf->intra_sf.skip_intra_in_interframe = 2; @@ -923,10 +1011,10 @@ static void set_good_speed_features_framesize_independent( sf->tpl_sf.prune_starting_mv = 2; sf->tpl_sf.subpel_force_stop = HALF_PEL; sf->tpl_sf.search_method = FAST_BIGDIA; + sf->tpl_sf.gop_length_decision_method = 1; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; - sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; // TODO(any): Experiment with enabling of this speed feature as hash state // is reset during winner mode processing @@ -948,9 +1036,14 @@ static void set_good_speed_features_framesize_independent( } if (speed >= 5) { + sf->fp_sf.reduce_mv_step_param = 4; + sf->part_sf.simple_motion_search_prune_agg = 3; sf->part_sf.ext_partition_eval_thresh = allow_screen_content_tools ? 
BLOCK_8X8 : BLOCK_16X16; + sf->part_sf.prune_sub_8x8_partition_level = + (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0 + : 2; sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; sf->inter_sf.prune_inter_modes_if_skippable = 1; @@ -974,8 +1067,11 @@ static void set_good_speed_features_framesize_independent( sf->tpl_sf.prune_starting_mv = 3; sf->tpl_sf.use_y_only_rate_distortion = 1; sf->tpl_sf.subpel_force_stop = FULL_PEL; + sf->tpl_sf.gop_length_decision_method = 2; sf->winner_mode_sf.dc_blk_pred_level = 1; + + sf->fp_sf.disable_recon = 1; } if (speed >= 6) { @@ -986,9 +1082,14 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; sf->inter_sf.prune_nearmv_using_neighbors = 1; sf->inter_sf.selective_ref_frame = 6; + sf->inter_sf.prune_ext_comp_using_neighbors = 3; sf->intra_sf.chroma_intra_pruning_with_hog = 4; sf->intra_sf.intra_pruning_with_hog = 4; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC; sf->part_sf.prune_rectangular_split_based_on_qidx = boosted || allow_screen_content_tools ? 
0 : 1; @@ -1000,10 +1101,10 @@ static void set_good_speed_features_framesize_independent( sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL; sf->mv_sf.use_bsize_dependent_search_method = 1; - sf->tpl_sf.disable_gop_length_decision = 1; + sf->tpl_sf.gop_length_decision_method = 3; sf->tpl_sf.disable_filtered_key_tpl = 1; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4; sf->tx_sf.use_intra_txb_hash = 1; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0; @@ -1052,10 +1153,13 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, #endif } } else { - if (speed == 8 && !cpi->use_svc) { + if (speed == 8 && !cpi->ppi->use_svc) { sf->rt_sf.short_circuit_low_temp_var = 0; sf->rt_sf.use_nonrd_altref_frame = 1; } + if (speed >= 9) { + sf->rt_sf.skip_cdef_sb = 1; + } } if (!is_480p_or_larger) { if (speed == 7) { @@ -1088,6 +1192,8 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->part_sf.less_rectangular_check_level = 1; sf->part_sf.ml_prune_partition = 1; sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; // TODO(debargha): Test, tweak and turn on either 1 or 2 sf->inter_sf.inter_mode_rd_model_estimation = 0; @@ -1103,6 +1209,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->interp_sf.use_fast_interpolation_filter_search = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; sf->intra_sf.intra_pruning_with_hog = 1; sf->mv_sf.full_pixel_search_level = 1; @@ -1140,7 +1247,6 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->inter_sf.prune_comp_search_by_single_result = 1; sf->inter_sf.reuse_inter_intra_mode = 1; sf->inter_sf.selective_ref_frame = 2; - sf->inter_sf.skip_repeated_newmv = 1; sf->inter_sf.disable_interintra_wedge_var_thresh = 0; 
sf->inter_sf.disable_interinter_wedge_var_thresh = 0; sf->inter_sf.prune_comp_type_by_comp_avg = 1; @@ -1191,7 +1297,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, if (speed >= 3) { sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; - sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; sf->part_sf.less_rectangular_check_level = 2; @@ -1202,7 +1308,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, // sf->mv_sf.adaptive_motion_search = 1; sf->inter_sf.adaptive_rd_thresh = 2; - sf->inter_sf.mv_cost_upd_level = 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine // it with cpi->sf.disable_wedge_search_var_thresh. sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; @@ -1306,12 +1412,20 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->part_sf.default_min_partition_size = BLOCK_8X8; sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + sf->mv_sf.search_method = FAST_DIAMOND; sf->mv_sf.subpel_force_stop = QUARTER_PEL; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; sf->inter_sf.inter_mode_rd_model_estimation = 2; + // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't + // good. May need more study. + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL; + } + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; @@ -1348,7 +1462,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, // TODO(marpan): Look into why enabling skip_loopfilter_non_reference is // not bitexact on rtc testset, its very close (< ~0.01 bdrate), but not // always bitexact. 
- if (cpi->use_svc && cpi->svc.non_reference_frame && + if (cpi->ppi->use_svc && cpi->svc.non_reference_frame && sf->lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q && sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) sf->rt_sf.skip_loopfilter_non_reference = 1; @@ -1398,8 +1512,14 @@ static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { hl_sf->second_alt_ref_filtering = 1; } +static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) { + fp_sf->reduce_mv_step_param = 3; + fp_sf->skip_motion_search_threshold = 0; + fp_sf->disable_recon = 0; +} + static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { - tpl_sf->disable_gop_length_decision = 0; + tpl_sf->gop_length_decision_method = 0; tpl_sf->prune_intra_modes = 0; tpl_sf->prune_starting_mv = 0; tpl_sf->reduce_first_step_size = 0; @@ -1415,6 +1535,7 @@ static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { gm_sf->gm_search_type = GM_FULL_SEARCH; gm_sf->prune_ref_frame_for_gm_search = 0; + gm_sf->prune_zero_mv_with_sse = 0; } static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { @@ -1454,6 +1575,9 @@ static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { part_sf->ml_predict_breakout_level = 0; part_sf->prune_sub_8x8_partition_level = 0; part_sf->simple_motion_search_rect_split = 0; + part_sf->reuse_prev_rd_results_for_part_ab = 0; + part_sf->reuse_best_prediction_for_part_ab = 0; + part_sf->use_best_rd_for_pruning = 0; } static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { @@ -1487,16 +1611,17 @@ static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { inter_sf->fast_wedge_sign_estimate = 0; inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; inter_sf->reuse_inter_intra_mode = 0; - inter_sf->mv_cost_upd_level = 0; + inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB; + 
inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB; inter_sf->prune_inter_modes_based_on_tpl = 0; inter_sf->prune_nearmv_using_neighbors = 0; inter_sf->prune_comp_search_by_single_result = 0; inter_sf->skip_repeated_ref_mv = 0; - inter_sf->skip_repeated_newmv = 0; - inter_sf->skip_repeated_full_newmv = 0; + inter_sf->skip_newmv_in_drl = 0; inter_sf->inter_mode_rd_model_estimation = 0; inter_sf->prune_compound_using_single_ref = 0; - inter_sf->prune_compound_using_neighbors = 0; + inter_sf->prune_ext_comp_using_neighbors = 0; inter_sf->prune_comp_using_best_single_mode_ref = 0; inter_sf->disable_onesided_comp = 0; inter_sf->prune_mode_search_simple_translation = 0; @@ -1514,9 +1639,10 @@ static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { inter_sf->txfm_rd_gate_level = 0; inter_sf->prune_inter_modes_if_skippable = 0; inter_sf->disable_masked_comp = 0; - inter_sf->reuse_best_prediction_for_part_ab = 0; inter_sf->enable_fast_compound_mode_search = 0; inter_sf->reuse_mask_search_results = 0; + inter_sf->enable_fast_wedge_mask_search = 0; + inter_sf->inter_mode_txfm_breakout = 0; } static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { @@ -1529,6 +1655,7 @@ static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { } static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { + intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB; intra_sf->skip_intra_in_interframe = 1; intra_sf->intra_pruning_with_hog = 0; intra_sf->chroma_intra_pruning_with_hog = 0; @@ -1539,8 +1666,10 @@ static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; } intra_sf->disable_smooth_intra = 0; - intra_sf->disable_filter_intra = 0; + intra_sf->prune_filter_intra_level = 0; intra_sf->prune_chroma_modes_using_luma_winner = 0; + intra_sf->cfl_search_range = 3; + intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT; } static AOM_INLINE 
void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { @@ -1650,9 +1779,11 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { break; } - if (!cpi->seq_params_locked) { - cpi->common.seq_params.enable_masked_compound &= + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->enable_masked_compound &= !sf->inter_sf.disable_masked_comp; + cpi->common.seq_params->enable_interintra_compound &= + (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); } // This is only used in motion vector unit test. @@ -1662,9 +1793,9 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv; if ((cpi->oxcf.row_mt == 1) && (cpi->oxcf.max_threads > 1)) { - if (sf->inter_sf.mv_cost_upd_level > 1) { + if (sf->inter_sf.mv_cost_upd_level < INTERNAL_COST_UPD_SBROW) { // Set mv_cost_upd_level to use row level update. - sf->inter_sf.mv_cost_upd_level = 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; } } } @@ -1676,6 +1807,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { int i; init_hl_sf(&sf->hl_sf); + init_fp_sf(&sf->fp_sf); init_tpl_sf(&sf->tpl_sf); init_gm_sf(&sf->gm_sf); init_part_sf(&sf->part_sf); @@ -1701,12 +1833,12 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { break; } - if (!cpi->seq_params_locked) { - cpi->common.seq_params.enable_dual_filter &= + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->enable_dual_filter &= !sf->interp_sf.disable_dual_filter; - cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter; + cpi->common.seq_params->enable_restoration &= !sf->lpf_sf.disable_lr_filter; - cpi->common.seq_params.enable_interintra_compound &= + cpi->common.seq_params->enable_interintra_compound &= (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); } @@ -1821,10 +1953,11 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP 
*cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; const int boosted = frame_is_boosted(cpi); + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; const int is_arf2_bwd_type = - cpi->gf_group.update_type[cpi->gf_group.index] == INTNL_ARF_UPDATE; + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; if (cpi->oxcf.mode == REALTIME) return; @@ -1832,7 +1965,6 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { // qindex_thresh for resolution < 720p const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140); if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) { - sf->inter_sf.skip_repeated_newmv = 1; sf->part_sf.simple_motion_search_split = cm->features.allow_screen_content_tools ? 1 : 2; sf->part_sf.simple_motion_search_early_term_none = 1; @@ -1849,7 +1981,6 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; - sf->inter_sf.skip_repeated_newmv = 1; sf->tx_sf.model_based_prune_tx_search_level = 0; if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) { @@ -1866,28 +1997,25 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { } } - if (speed >= 3) { - // Disable extended partitions for lower quantizers - const int qindex_thresh = - cm->features.allow_screen_content_tools ? 50 : 100; - if (cm->quant_params.base_qindex <= qindex_thresh && !boosted) { - sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; - } - } - - if (speed >= 4) { + if (speed >= 2) { // Disable extended partitions for lower quantizers - const int qindex_thresh = boosted ? 
80 : 120; - if (cm->quant_params.base_qindex <= qindex_thresh && - !frame_is_intra_only(&cpi->common)) { - sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + const int aggr = AOMMIN(3, speed - 2); + const int qindex_thresh1[4] = { 50, 50, 80, 100 }; + const int qindex_thresh2[4] = { 80, 100, 120, 160 }; + int qindex_thresh; + int disable_ext_part; + if (aggr <= 1) { + const int qthresh2 = + (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr]; + qindex_thresh = cm->features.allow_screen_content_tools + ? qindex_thresh1[aggr] + : qthresh2; + disable_ext_part = !boosted; + } else { + qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr]; + disable_ext_part = !frame_is_intra_only(cm); } - } - - if (speed >= 5) { - const int qindex_thresh = boosted ? 100 : 160; - if (cm->quant_params.base_qindex <= qindex_thresh && - !frame_is_intra_only(&cpi->common)) { + if (cm->quant_params.base_qindex <= qindex_thresh && disable_ext_part) { sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } } diff --git a/third_party/libaom/source/libaom/av1/encoder/speed_features.h b/third_party/libaom/source/libaom/av1/encoder/speed_features.h index 90765febfb..3cf4c3d10b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/speed_features.h +++ b/third_party/libaom/source/libaom/av1/encoder/speed_features.h @@ -287,17 +287,30 @@ enum { SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio } UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE); - /*!\endcond */ + +/*!\enum INTERNAL_COST_UPDATE_TYPE + * \brief This enum decides internally how often to update the entropy costs + * + * INTERNAL_COST_UPD_TYPE is similar to \ref COST_UPDATE_TYPE but has slightly + * more flexibility in update frequency. This enum is separate from \ref + * COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed, its + * values are public so it cannot be modified without breaking public API. 
+ */ +typedef enum { + INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */ + INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row_set of height 256 pixs. */ + INTERNAL_COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ + INTERNAL_COST_UPD_SB, /*!< Update every sb. */ +} INTERNAL_COST_UPDATE_TYPE; + /*! * \brief Sequence/frame level speed vs quality features */ typedef struct HIGH_LEVEL_SPEED_FEATURES { - /*!\cond */ - // Frame level coding parameter update + /*! Frame level coding parameter update. */ int frame_parameter_update; - /*!\endcond */ /*! * Cases and frame types for which the recode loop is enabled. */ @@ -309,25 +322,27 @@ typedef struct HIGH_LEVEL_SPEED_FEATURES { */ int recode_tolerance; - /*!\cond */ - // Determine how motion vector precision is chosen. The possibilities are: - // LAST_MV_DATA: use the mv data from the last coded frame - // CURRENT_Q: use the current q as a threshold - // QTR_ONLY: use quarter pel precision only. + /*! + * Determine how motion vector precision is chosen. The possibilities are: + * LAST_MV_DATA: use the mv data from the last coded frame + * CURRENT_Q: use the current q as a threshold + * QTR_ONLY: use quarter pel precision only. + */ MV_PREC_LOGIC high_precision_mv_usage; - // Always set to 0. If on it enables 0 cost background transmission - // (except for the initial transmission of the segmentation). The feature is - // disabled because the addition of very large block sizes make the - // backgrounds very to cheap to encode, and the segmentation we have - // adds overhead. + /*! + * Always set to 0. If on it enables 0 cost background transmission + * (except for the initial transmission of the segmentation). The feature is + * disabled because the addition of very large block sizes make the + * backgrounds very to cheap to encode, and the segmentation we have + * adds overhead. + */ int static_segmentation; /*! 
* Superres-auto mode search type: */ SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type; - /*!\endcond */ /*! * Enable/disable extra screen content test by encoding key frame twice. @@ -340,10 +355,39 @@ typedef struct HIGH_LEVEL_SPEED_FEATURES { int second_alt_ref_filtering; } HIGH_LEVEL_SPEED_FEATURES; +/*! + * Speed features for the first pass. + */ +typedef struct FIRST_PASS_SPEED_FEATURES { + /*! + * \brief Reduces the mv search window. + * By default, the initial search window is around + * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023). + * Each step reduction decrease the window size by about a factor of 2. + */ + int reduce_mv_step_param; + + /*! + * \brief Skips the motion search when the zero mv has small sse. + */ + int skip_motion_search_threshold; + + /*! + * \brief Skips reconstruction by using source buffers for prediction + */ + int disable_recon; +} FIRST_PASS_SPEED_FEATURES; + /*!\cond */ typedef struct TPL_SPEED_FEATURES { - // Enable/disable GOP length adaptive decision. - int disable_gop_length_decision; + // GOP length adaptive decision. + // If set to 0, tpl model decides whether a shorter gf interval is better. + // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and + // (base+2) layer decide whether a shorter gf interval is better. + // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost + // decide whether a shorter gf interval is better. + // If set to 3, gop length adaptive decision is disabled. + int gop_length_decision_method; // Prune the intra modes search by tpl. // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED. // If set to 1, we only search DC_PRED, V_PRED, and H_PRED. 
@@ -387,6 +431,10 @@ typedef struct GLOBAL_MOTION_SPEED_FEATURES { // given direction(past/future), if the evaluated ref_frame in that direction // yields gm_type as INVALID/TRANSLATION/IDENTITY int prune_ref_frame_for_gm_search; + + // When the current GM type is set to ZEROMV, prune ZEROMV if its performance + // is worse than NEWMV under SSE metric. + int prune_zero_mv_with_sse; } GLOBAL_MOTION_SPEED_FEATURES; typedef struct PARTITION_SPEED_FEATURES { @@ -511,6 +559,53 @@ typedef struct PARTITION_SPEED_FEATURES { // Prune rectangular split based on simple motion search split/no_split score. // 0: disable pruning, 1: enable pruning int simple_motion_search_rect_split; + + // The current encoder adopts a DFS search for block partitions. + // Therefore the mode selection and associated rdcost is ready for smaller + // blocks before the mode selection for some partition types. + // AB partition could use previous rd information and skip mode search. + // An example is: + // + // current block + // +---+---+ + // | | + // + + + // | | + // +-------+ + // + // SPLIT partition has been searched first before trying HORZ_A + // +---+---+ + // | R | R | + // +---+---+ + // | R | R | + // +---+---+ + // + // HORZ_A + // +---+---+ + // | | | + // +---+---+ + // | | + // +-------+ + // + // With this speed feature, the top two sub blocks can directly use rdcost + // searched in split partition, and the mode info is also copied from + // saved info. Similarly, the bottom rectangular block can also use + // the available information from previous rectangular search. + int reuse_prev_rd_results_for_part_ab; + + // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT + // when encoding PARTITION_AB. + int reuse_best_prediction_for_part_ab; + + // The current partition search records the best rdcost so far and uses it + // in mode search and transform search to early skip when some criteria is + // met. 
For example, when the current rdcost is larger than the best rdcost, + // or the model rdcost is larger than the best rdcost times some thresholds. + // By default, this feature is turned on to speed up the encoder partition + // search. + // If disabling it, at speed 0, 30 frames, we could get + // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown. + int use_best_rd_for_pruning; } PARTITION_SPEED_FEATURES; typedef struct MV_SPEED_FEATURES { @@ -621,16 +716,19 @@ typedef struct INTER_MODE_SPEED_FEATURES { int alt_ref_search_fp; - // flag to skip NEWMV mode in drl if the motion search result is the same - int skip_repeated_newmv; - - // Skip the current ref_mv in NEW_MV mode if we have already encountered - // another ref_mv in the drl such that: - // 1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION - // search process as the current fullpel_mv. - // 2. The rate needed to encode the current fullpel_mv is larger than that - // for the other ref_mv. - int skip_repeated_full_newmv; + // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc. + // This speed feature equaling 0 means no skipping. + // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode + // if we have already encountered ref_mv in the drl such that: + // 1. The other drl has the same mv during the SIMPLE_TRANSLATION search + // process as the current mv. + // 2. The rate needed to encode the current mv is larger than that for the + // other ref_mv. + // The speed feature equaling 1 means using subpel mv in the comparison. + // The speed feature equaling 2 means using fullpel mv in the comparison. + // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on + // known full_mv bestsme and drl cost. 
+ int skip_newmv_in_drl; // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV, // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found @@ -677,12 +775,14 @@ typedef struct INTER_MODE_SPEED_FEATURES { // the single reference modes, it is one of the two best performers. int prune_compound_using_single_ref; - // Skip extended compound mode using ref frames of above and left neighbor + // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV, + // NEW_NEARMV) using ref frames of above and left neighbor // blocks. // 0 : no pruning - // 1 : prune extended compound mode (less aggressiveness) - // 2 : prune extended compound mode (high aggressiveness) - int prune_compound_using_neighbors; + // 1 : prune ext compound modes using neighbor blocks (less aggressiveness) + // 2 : prune ext compound modes using neighbor blocks (high aggressiveness) + // 3 : prune ext compound modes unconditionally (highest aggressiveness) + int prune_ext_comp_using_neighbors; // Skip extended compound mode when ref frame corresponding to NEWMV does not // have NEWMV as single mode winner. @@ -722,12 +822,15 @@ typedef struct INTER_MODE_SPEED_FEATURES { // Decide when and how to use joint_comp. DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; - // To skip cost update for mv. - // mv_cost_upd_level indicates the aggressiveness of skipping. - // 0: update happens at each sb level. - // 1: update happens once for each sb row. - // 2: update happens once for a set of rows. - int mv_cost_upd_level; + // Clip the frequency of updating the mv cost. + INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level; + + // Clip the frequency of updating the coeff cost. + INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level; + + // Clip the frequency of updating the mode cost. + INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level; + // Prune inter modes based on tpl stats // 0 : no pruning // 1 - 3 indicate increasing aggressiveness in order. 
@@ -750,15 +853,17 @@ typedef struct INTER_MODE_SPEED_FEATURES { // Enable/disable masked compound. int disable_masked_comp; - // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT - // when encoding PARTITION_AB. - int reuse_best_prediction_for_part_ab; - // Enable/disable the fast compound mode search. int enable_fast_compound_mode_search; // Reuse masked compound type search results int reuse_mask_search_results; + + // Enable/disable fast search for wedge masks + int enable_fast_wedge_mask_search; + + // Early breakout from transform search of inter modes + int inter_mode_txfm_breakout; } INTER_MODE_SPEED_FEATURES; typedef struct INTERP_FILTER_SPEED_FEATURES { @@ -808,8 +913,11 @@ typedef struct INTRA_MODE_SPEED_FEATURES { // Enable/disable smooth intra modes. int disable_smooth_intra; - // Enable/disable filter intra modes. - int disable_filter_intra; + // Prune filter intra modes in intra frames. + // 0 : No pruning + // 1 : Evaluate applicable filter intra modes based on best intra mode so far + // 2 : Do not evaluate filter intra modes + int prune_filter_intra_level; // prune palette search // 0: No pruning @@ -825,6 +933,27 @@ typedef struct INTRA_MODE_SPEED_FEATURES { // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED, // UV_CFL_PRED and the mode that corresponds to luma intra mode winner. int prune_chroma_modes_using_luma_winner; + + // Clip the frequency of updating the mv cost for intrabc. + INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level; + + // We use DCT_DCT transform followed by computing SATD (Sum of Absolute + // Transformed Differences) as an estimation of RD score to quickly find the + // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search + // near the best possible parameter. The search range is set here. + // The range of cfl_searh_range should be [1, 33], and the following are the + // recommended values. + // 1: Fastest mode. 
+ // 3: Default mode that provides good speedup without losing compression + // performance at speed 0. + // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only + // be used for debugging purpose. + int cfl_search_range; + + // TOP_INTRA_MODEL_COUNT is 4 that is the number of top model rd to store in + // intra mode decision. Here, add a speed feature to reduce this number for + // higher speeds. + int top_intra_model_count_allowed; } INTRA_MODE_SPEED_FEATURES; typedef struct TX_SPEED_FEATURES { @@ -1082,6 +1211,11 @@ typedef struct REAL_TIME_SPEED_FEATURES { // Skips mode checks more agressively in nonRD mode int nonrd_agressive_skip; + + // Skip cdef on 64x64 blocks when NEWMV or INTRA is not picked or color + // sensitivity is off. When color sensitivity is on for a superblock, all + // 64x64 blocks within will not skip. + int skip_cdef_sb; } REAL_TIME_SPEED_FEATURES; /*!\endcond */ @@ -1096,6 +1230,11 @@ typedef struct SPEED_FEATURES { HIGH_LEVEL_SPEED_FEATURES hl_sf; /*! + * Speed features for the first pass. + */ + FIRST_PASS_SPEED_FEATURES fp_sf; + + /*! * Speed features related to how tpl's searches are done. 
*/ TPL_SPEED_FEATURES tpl_sf; diff --git a/third_party/libaom/source/libaom/av1/encoder/superres_scale.c b/third_party/libaom/source/libaom/av1/encoder/superres_scale.c index bcd3fefdfe..283faabe61 100644 --- a/third_party/libaom/source/libaom/av1/encoder/superres_scale.c +++ b/third_party/libaom/source/libaom/av1/encoder/superres_scale.c @@ -80,7 +80,7 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; - if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR; + if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR; switch (resize_cfg->resize_mode) { case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break; case RESIZE_FIXED: @@ -109,12 +109,13 @@ int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) { #define SUPERRES_ENERGY_BY_AC_THRESH 0.2 static double get_energy_by_q2_thresh(const GF_GROUP *gf_group, - const RATE_CONTROL *rc) { + const RATE_CONTROL *rc, + int gf_frame_index) { // TODO(now): Return keyframe thresh * factor based on frame type / pyramid // level. - if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) { return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME; - } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) { + } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) { if (rc->frames_to_key <= 1) return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO; else @@ -142,15 +143,15 @@ static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, int sr_kf, int sr_arf) { // Use superres for Key-frames and Alt-ref frames only. 
- const GF_GROUP *gf_group = &cpi->gf_group; - if (gf_group->update_type[gf_group->index] != KF_UPDATE && - gf_group->update_type[gf_group->index] != ARF_UPDATE) { + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) { return SCALE_NUMERATOR; } - if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) { + if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) { return SCALE_NUMERATOR; } - if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) { + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) { return SCALE_NUMERATOR; } @@ -158,7 +159,7 @@ static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, analyze_hor_freq(cpi, energy); const double energy_by_q2_thresh = - get_energy_by_q2_thresh(gf_group, &cpi->rc); + get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index); int denom = get_superres_denom_from_qindex_energy( qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH); /* @@ -166,8 +167,8 @@ static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); printf("]\n"); printf("boost = %d\n", - (gf_group->update_type[gf_group->index] == KF_UPDATE) - ? cpi->rc.kf_boost + (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE) + ? cpi->ppi->p_rc.kf_boost : cpi->rc.gfu_boost); printf("denom = %d\n", denom); */ @@ -194,8 +195,8 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Make sure that superres mode of the frame is consistent with the // sequence-level flag. 
assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE, - cpi->common.seq_params.enable_superres)); - assert(IMPLIES(!cpi->common.seq_params.enable_superres, + cpi->common.seq_params->enable_superres)); + assert(IMPLIES(!cpi->common.seq_params->enable_superres, superres_cfg->superres_mode == AOM_SUPERRES_NONE)); // Make sure that superres mode for current encoding is consistent with user // provided superres mode. @@ -222,8 +223,8 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Now decide the use of superres based on 'q'. int bottom_index, top_index; const int q = av1_rc_pick_q_and_bounds( - cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height, - cpi->gf_group.index, &bottom_index, &top_index); + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); const int qthresh = (frame_is_intra_only(&cpi->common)) ? superres_cfg->superres_kf_qthresh @@ -243,8 +244,8 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Now decide the use of superres based on 'q'. 
int bottom_index, top_index; const int q = av1_rc_pick_q_and_bounds( - cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height, - cpi->gf_group.index, &bottom_index, &top_index); + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); const SUPERRES_AUTO_SEARCH_TYPE sr_search_type = cpi->sf.hl_sf.superres_auto_search_type; @@ -345,7 +346,7 @@ static size_params_type calculate_next_size_params(AV1_COMP *cpi) { size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height, SCALE_NUMERATOR }; int resize_denom = SCALE_NUMERATOR; - if (has_no_stats_stage(cpi) && cpi->use_svc && + if (has_no_stats_stage(cpi) && cpi->ppi->use_svc && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) { rsz.resize_width = cpi->common.width; rsz.resize_height = cpi->common.height; diff --git a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c index 17109201e6..5cff958a85 100644 --- a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c +++ b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c @@ -30,6 +30,7 @@ void av1_init_layer_context(AV1_COMP *const cpi) { svc->current_superframe = 0; svc->force_zero_mode_spatial_ref = 1; svc->num_encoded_top_layer = 0; + svc->use_flexible_mode = 0; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { @@ -90,6 +91,7 @@ void av1_init_layer_context(AV1_COMP *const cpi) { void av1_update_layer_context_change_config(AV1_COMP *const cpi, const int64_t target_bandwidth) { const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; SVC *const svc = &cpi->svc; int layer = 0; int64_t spatial_layer_target = 0; @@ -106,17 +108,18 @@ void av1_update_layer_context_change_config(AV1_COMP *const cpi, LAYER_CONTEXT *const lc = &svc->layer_context[sl * svc->number_temporal_layers + tl]; RATE_CONTROL *const lrc 
= &lc->rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; - lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); - lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); - lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lp_rc->starting_buffer_level = + (int64_t)(p_rc->starting_buffer_level * bitrate_alloc); + lp_rc->optimal_buffer_level = + (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc); + lp_rc->maximum_buffer_size = + (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc); lrc->bits_off_target = - AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); - lrc->buffer_level = AOMMIN(lrc->buffer_level, lrc->maximum_buffer_size); + AOMMIN(lrc->bits_off_target, lp_rc->maximum_buffer_size); + lrc->buffer_level = AOMMIN(lrc->buffer_level, lp_rc->maximum_buffer_size); lc->framerate = cpi->framerate / lc->framerate_factor; lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; @@ -164,7 +167,6 @@ void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { } void av1_restore_layer_context(AV1_COMP *const cpi) { - GF_GROUP *const gf_group = &cpi->gf_group; SVC *const svc = &cpi->svc; const AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *const lc = get_layer_context(cpi); @@ -172,8 +174,9 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { const int old_frame_to_key = cpi->rc.frames_to_key; // Restore layer rate control. 
cpi->rc = lc->rc; + cpi->ppi->p_rc = lc->p_rc; cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth; - gf_group->index = 0; + cpi->gf_frame_index = 0; cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude; if (cpi->mv_search_params.max_mv_magnitude == 0) cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height); @@ -198,7 +201,7 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { // This is to skip searching mv for that reference if it was last // refreshed (i.e., buffer slot holding that reference was refreshed) on the // previous spatial layer(s) at the same time (current_superframe). - if (svc->external_ref_frame_config && svc->force_zero_mode_spatial_ref) { + if (svc->set_ref_frame_config && svc->force_zero_mode_spatial_ref) { int ref_frame_idx = svc->ref_idx[LAST_FRAME - 1]; if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe && svc->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1) @@ -211,13 +214,13 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { } void av1_save_layer_context(AV1_COMP *const cpi) { - GF_GROUP *const gf_group = &cpi->gf_group; SVC *const svc = &cpi->svc; const AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *lc = get_layer_context(cpi); lc->rc = cpi->rc; + lc->p_rc = cpi->ppi->p_rc; lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth; - lc->group_index = gf_group->index; + lc->group_index = cpi->gf_frame_index; lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude; if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, @@ -243,7 +246,7 @@ void av1_save_layer_context(AV1_COMP *const cpi) { svc->buffer_time_index[i] = svc->current_superframe; svc->buffer_spatial_layer[i] = svc->spatial_layer_id; } - } else if (cpi->svc.external_ref_frame_config) { + } else if (cpi->svc.set_ref_frame_config) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { int 
ref_frame_map_idx = svc->ref_idx[i]; if (cpi->svc.refresh[ref_frame_map_idx]) { @@ -342,3 +345,171 @@ void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { cpi->common.height = height; av1_update_frame_size(cpi); } + +enum { + SVC_LAST_FRAME = 0, + SVC_LAST2_FRAME, + SVC_LAST3_FRAME, + SVC_GOLDEN_FRAME, + SVC_BWDREF_FRAME, + SVC_ALTREF2_FRAME, + SVC_ALTREF_FRAME +}; + +// For fixed svc mode: fixed pattern is set based on the number of +// spatial and temporal layers, and the ksvc_fixed_mode. +void av1_set_svc_fixed_mode(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int i; + assert(svc->use_flexible_mode == 0); + // Fixed SVC mode only supports at most 3 spatial or temporal layers. + assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 && + svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3); + svc->set_ref_frame_config = 1; + int superframe_cnt = svc->current_superframe; + // Set the reference map buffer idx for the 7 references: + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = i; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->reference[i] = 0; + for (i = 0; i < REF_FRAMES; i++) svc->refresh[i] = 0; + // Always reference LAST, and reference GOLDEN on SL > 0. + // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later + // when frame_type is set. + svc->reference[SVC_LAST_FRAME] = 1; + if (svc->spatial_layer_id > 0) svc->reference[SVC_GOLDEN_FRAME] = 1; + if (svc->temporal_layer_id == 0) { + // Base temporal layer. + if (svc->spatial_layer_id == 0) { + // Set all buffer_idx to 0. Update slot 0 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->refresh[0] = 1; + } else if (svc->spatial_layer_id == 1) { + // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to + // slot 0. Update slot 1 (LAST). 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 1; + svc->refresh[1] = 1; + } else if (svc->spatial_layer_id == 2) { + // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to + // slot 1. Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 1; + svc->ref_idx[SVC_LAST_FRAME] = 2; + svc->refresh[2] = 1; + } + } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_GOLDEN_FRAME] = 3; + svc->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and Update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 3; + svc->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_LAST2_FRAME] = 4; + svc->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 4; + svc->ref_idx[SVC_LAST_FRAME] = 2; + } + } else if (svc->temporal_layer_id == 1) { + // Middle temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + svc->ref_idx[SVC_GOLDEN_FRAME] = 5; + svc->refresh[5] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 5; + svc->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + svc->ref_idx[SVC_LAST3_FRAME] = 6; + svc->refresh[6] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST3 to slot 7 and update slot 7. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 6; + svc->ref_idx[SVC_LAST_FRAME] = 2; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + svc->ref_idx[SVC_LAST3_FRAME] = 7; + svc->refresh[7] = 1; + } + } + } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 5; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_GOLDEN_FRAME] = 3; + svc->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 6; + svc->ref_idx[SVC_GOLDEN_FRAME] = 3; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_LAST2_FRAME] = 4; + svc->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 7; + svc->ref_idx[SVC_GOLDEN_FRAME] = 4; + } + } +} + +void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth > (3 * lrc->prev_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->prev_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. 
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc2 = &svc->layer_context[layer2]; + RATE_CONTROL *lrc2 = &lc2->rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc2->p_rc; + lrc2->rc_1_frame = 0; + lrc2->rc_2_frame = 0; + lrc2->bits_off_target = lp_rc->optimal_buffer_level; + lrc2->buffer_level = lp_rc->optimal_buffer_level; + } + } + } +} diff --git a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h index 1eeba5e273..817e3620b0 100644 --- a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h +++ b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h @@ -26,6 +26,7 @@ extern "C" { typedef struct { /*!\cond */ RATE_CONTROL rc; + PRIMARY_RATE_CONTROL p_rc; int framerate_factor; int64_t layer_target_bitrate; int scaling_factor_num; @@ -94,8 +95,10 @@ typedef struct SVC { int temporal_layer_id; int number_spatial_layers; int number_temporal_layers; - int external_ref_frame_config; + int set_ref_frame_config; int non_reference_frame; + int use_flexible_mode; + int ksvc_fixed_mode; /*!\endcond */ /*! 
@@ -271,6 +274,11 @@ int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi); void av1_get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out); + +void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi); + +void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c index 676e110e60..6833ac8a40 100644 --- a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c +++ b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c @@ -155,7 +155,7 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb, best_mv.as_mv.row = GET_MV_SUBPEL(mv_row); best_mv.as_mv.col = GET_MV_SUBPEL(mv_col); const int mv_offset = mv_row * y_stride + mv_col; - error = cpi->fn_ptr[block_size].vf( + error = cpi->ppi->fn_ptr[block_size].vf( ref_frame->y_buffer + y_offset + mv_offset, y_stride, frame_to_filter->y_buffer + y_offset, y_stride, &sse); block_mse = DIVIDE_AND_ROUND(error, mb_pels); @@ -561,9 +561,16 @@ void av1_apply_temporal_filter_c( (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Decay factors for non-local mean approach. double decay_factor[MAX_MB_PLANE] = { 0 }; - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. 
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -745,10 +752,19 @@ static void tf_normalize_filtered_frame( } int av1_get_q(const AV1_COMP *cpi) { - const GF_GROUP *gf_group = &cpi->gf_group; - const FRAME_TYPE frame_type = gf_group->frame_type[gf_group->index]; - const int q = (int)av1_convert_qindex_to_q( - cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth); + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + int avg_frame_qindex; +#if CONFIG_FRAME_PARALLEL_ENCODE + avg_frame_qindex = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[frame_type] + : cpi->rc.avg_frame_qindex[frame_type]; +#else + avg_frame_qindex = cpi->rc.avg_frame_qindex[frame_type]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + const int q = (int)av1_convert_qindex_to_q(avg_frame_qindex, + cpi->common.seq_params->bit_depth); return q; } @@ -855,23 +871,24 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) { } } tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes, - accum, count, &cpi->alt_ref_buffer); + accum, count, &cpi->ppi->alt_ref_buffer); if (check_show_existing) { const int y_height = mb_height >> mbd->plane[0].subsampling_y; const int y_width = mb_width >> mbd->plane[0].subsampling_x; const int source_y_stride = frame_to_filter->y_stride; - const int filter_y_stride = cpi->alt_ref_buffer.y_stride; + const int filter_y_stride = cpi->ppi->alt_ref_buffer.y_stride; const int source_offset = mb_row * y_height * source_y_stride + mb_col * y_width; const int filter_offset = mb_row * y_height * filter_y_stride + mb_col * y_width; unsigned int sse = 0; - cpi->fn_ptr[block_size].vf( + cpi->ppi->fn_ptr[block_size].vf( frame_to_filter->y_buffer + source_offset, source_y_stride, - cpi->alt_ref_buffer.y_buffer + filter_offset, filter_y_stride, &sse); 
+ cpi->ppi->alt_ref_buffer.y_buffer + filter_offset, filter_y_stride, + &sse); diff->sum += sse; - diff->sse += sse * sse; + diff->sse += sse * (int64_t)sse; } } } @@ -939,8 +956,9 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, const int lookahead_depth = av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); - int arf_src_offset = cpi->gf_group.arf_src_offset[cpi->gf_group.index]; - const FRAME_TYPE frame_type = cpi->gf_group.frame_type[cpi->gf_group.index]; + int arf_src_offset = cpi->ppi->gf_group.arf_src_offset[cpi->gf_frame_index]; + const FRAME_TYPE frame_type = + cpi->ppi->gf_group.frame_type[cpi->gf_frame_index]; // Temporal filtering should not go beyond key frames const int key_to_curframe = @@ -949,10 +967,10 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, AOMMAX(cpi->rc.frames_to_key - arf_src_offset - 1, 0); // Number of buffered frames before the to-filter frame. - const int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe); + int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe); // Number of buffered frames after the to-filter frame. - const int max_after = + int max_after = AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key); // Estimate noises for each plane. @@ -964,26 +982,34 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, double *noise_levels = tf_ctx->noise_levels; for (int plane = 0; plane < num_planes; ++plane) { noise_levels[plane] = av1_estimate_noise_from_single_plane( - to_filter_frame, plane, cpi->common.seq_params.bit_depth); + to_filter_frame, plane, cpi->common.seq_params->bit_depth); } // Get quantization factor. 
const int q = av1_get_q(cpi); - // Get correlation estimates from first-pass - RATE_CONTROL *rc = &cpi->rc; - const double *coeff = rc->cor_coeff; - const int offset = rc->regions_offset; - int cur_frame_idx = - filter_frame_lookahead_idx + rc->frames_since_key - offset; - + // Get correlation estimates from first-pass; + const FIRSTPASS_STATS *stats = + cpi->ppi->twopass.stats_in - (cpi->rc.frames_since_key == 0); double accu_coeff0 = 1.0, accu_coeff1 = 1.0; for (int i = 1; i <= max_after; i++) { - accu_coeff1 *= coeff[cur_frame_idx + i]; + if (stats + filter_frame_lookahead_idx + i >= + cpi->ppi->twopass.stats_buf_ctx->stats_in_end) { + max_after = i - 1; + break; + } + accu_coeff1 *= + AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001); } if (max_after >= 1) { accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after); } for (int i = 1; i <= max_before; i++) { - accu_coeff0 *= coeff[cur_frame_idx - i + 1]; + if (stats + filter_frame_lookahead_idx - i + 1 <= + cpi->ppi->twopass.stats_buf_ctx->stats_in_start) { + max_before = i - 1; + break; + } + accu_coeff0 *= + AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001); } if (max_before >= 1) { accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before); @@ -1008,7 +1034,7 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, num_before = AOMMIN(num_frames - 1, max_before); num_after = 0; } else { - num_frames = AOMMIN(num_frames, cpi->rc.gfu_boost / 150); + num_frames = AOMMIN(num_frames, cpi->ppi->p_rc.gfu_boost / 150); num_frames += !(num_frames & 1); // Make the number odd. // Only use 2 neighbours for the second ARF. 
if (is_second_arf) num_frames = AOMMIN(num_frames, 3); @@ -1051,10 +1077,10 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame); av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes, - cpi->common.seq_params.sb_size); + cpi->common.seq_params->sb_size); av1_setup_block_planes(&cpi->td.mb.e_mbd, - cpi->common.seq_params.subsampling_x, - cpi->common.seq_params.subsampling_y, num_planes); + cpi->common.seq_params->subsampling_x, + cpi->common.seq_params->subsampling_y, num_planes); } /*!\cond */ @@ -1174,8 +1200,8 @@ int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx, int *show_existing_arf) { MultiThreadInfo *const mt_info = &cpi->mt_info; // Basic informaton of the current frame. - const GF_GROUP *const gf_group = &cpi->gf_group; - const uint8_t group_idx = gf_group->index; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const uint8_t group_idx = cpi->gf_frame_index; TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; TemporalFilterData *tf_data = &cpi->td.tf_data; // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame). 
@@ -1236,9 +1262,9 @@ int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx, int top_index = 0; int bottom_index = 0; const int q = av1_rc_pick_q_and_bounds( - cpi, &cpi->rc, cpi->oxcf.frm_dim_cfg.width, - cpi->oxcf.frm_dim_cfg.height, group_idx, &bottom_index, &top_index); - const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params.bit_depth); + cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, + group_idx, &bottom_index, &top_index); + const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params->bit_depth); const float threshold = 0.7f * ac_q * ac_q; if (!is_second_arf) { diff --git a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h index 2ae7dd4bda..3b9563755c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h +++ b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h @@ -64,6 +64,14 @@ struct ThreadData; // then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold // for 360p videos will be 360 * 0.1 = 36. #define TF_SEARCH_DISTANCE_THRESHOLD 0.1 +// 6. Threshold to identify if the q is in a relative high range. +// Above this cutoff q, a stronger filtering is applied. +// For a high q, the quantization throws away more information, and thus a +// stronger filtering is less likely to distort the encoded quality, while a +// stronger filtering could reduce bit rates. +// Ror a low q, more details are expected to be retained. Filtering is thus +// more conservative. +#define TF_QINDEX_CUTOFF 128 #define NOISE_ESTIMATION_EDGE_THRESHOLD 50 @@ -276,11 +284,6 @@ static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data, aom_free(tf_data->pred); } -// Helper function to compute number of blocks on either side of the frame. 
-static INLINE int get_num_blocks(const int frame_length, const int mb_length) { - return (frame_length + mb_length - 1) / mb_length; -} - // Saves the state prior to temporal filter process. // Inputs: // mbd: Pointer to the block for filtering. diff --git a/third_party/libaom/source/libaom/av1/encoder/tokenize.c b/third_party/libaom/source/libaom/av1/encoder/tokenize.c index bc63cc00ae..7e16b29a9a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tokenize.c +++ b/third_party/libaom/source/libaom/av1/encoder/tokenize.c @@ -155,16 +155,18 @@ static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size, const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsw * bsh; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane, arg); block += step; diff --git a/third_party/libaom/source/libaom/av1/encoder/tokenize.h b/third_party/libaom/source/libaom/av1/encoder/tokenize.h index 51eb28cee6..f31dc96958 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tokenize.h +++ b/third_party/libaom/source/libaom/av1/encoder/tokenize.h @@ -119,8 +119,8 @@ static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, // Allocate memory for token related info. 
static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info) { int mi_rows_aligned_to_sb = - ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2); - int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params->mib_size_log2; const int num_planes = av1_num_planes(cm); unsigned int tokens = get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols, diff --git a/third_party/libaom/source/libaom/av1/encoder/tpl_model.c b/third_party/libaom/source/libaom/av1/encoder/tpl_model.c index 6ae957d4e5..e07ab3e311 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tpl_model.c +++ b/third_party/libaom/source/libaom/av1/encoder/tpl_model.c @@ -35,38 +35,48 @@ #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tpl_model.h" -static AOM_INLINE int tpl_use_multithread(const AV1_COMP *cpi) { - return cpi->mt_info.num_workers > 1 && !cpi->sf.tpl_sf.allow_compound_pred; +static INLINE double exp_bounded(double v) { + // When v > 700 or <-700, the exp function will be close to overflow + // For details, see the "Notes" in the following link. + // https://en.cppreference.com/w/c/numeric/math/exp + if (v > 700) { + return DBL_MAX; + } else if (v < -700) { + return 0; + } + return exp(v); } -static AOM_INLINE void tpl_stats_record_txfm_block(TplDepFrame *tpl_frame, - const tran_low_t *coeff) { - aom_clear_system_state(); - // For transform larger than 16x16, the scale of coeff need to be adjusted. - // It's not LOSSLESS_Q_STEP. 
- assert(tpl_frame->coeff_num <= 256); - for (int i = 0; i < tpl_frame->coeff_num; ++i) { - tpl_frame->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP; +void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) { + tpl_txfm_stats->coeff_num = 256; + tpl_txfm_stats->txfm_block_count = 0; + memset(tpl_txfm_stats->abs_coeff_sum, 0, + sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num); +} + +void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, + TplTxfmStats *accumulated_stats) { + accumulated_stats->txfm_block_count += sub_stats->txfm_block_count; + for (int i = 0; i < accumulated_stats->coeff_num; ++i) { + accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i]; } - ++tpl_frame->txfm_block_count; } -static AOM_INLINE void tpl_stats_update_abs_coeff_mean(TplDepFrame *tpl_frame) { - aom_clear_system_state(); - for (int i = 0; i < tpl_frame->coeff_num; ++i) { - tpl_frame->abs_coeff_mean[i] = - tpl_frame->abs_coeff_sum[i] / tpl_frame->txfm_block_count; +void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, + const tran_low_t *coeff) { + // For transform larger than 16x16, the scale of coeff need to be adjusted. + // It's not LOSSLESS_Q_STEP. 
+ assert(tpl_txfm_stats->coeff_num <= 256); + for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) { + tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP; } + ++tpl_txfm_stats->txfm_block_count; } -void av1_tpl_stats_init_txfm_stats(TplDepFrame *tpl_frame, int tpl_bsize_1d) { - aom_clear_system_state(); - tpl_frame->txfm_block_count = 0; - tpl_frame->coeff_num = tpl_bsize_1d * tpl_bsize_1d; - memset(tpl_frame->abs_coeff_sum, 0, sizeof(tpl_frame->abs_coeff_sum)); - assert(sizeof(tpl_frame->abs_coeff_mean) / - sizeof(tpl_frame->abs_coeff_mean[0]) == - tpl_frame->coeff_num); +static AOM_INLINE void av1_tpl_store_txfm_stats( + TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats, + const int frame_index) { + tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats; } static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane, @@ -118,9 +128,11 @@ static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2, assert(*tpl_bsize_1d >= 16); } -void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, - int lag_in_frames) { - CommonModeInfoParams *const mi_params = &cm->mi_params; +void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi, + CommonModeInfoParams *const mi_params, int width, + int height, int byte_alignment, int lag_in_frames) { + SequenceHeader *const seq_params = &ppi->seq_params; + TplParams *const tpl_data = &ppi->tpl_data; set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, &tpl_data->tpl_bsize_1d); const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; @@ -139,7 +151,6 @@ void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width; tpl_frame->mi_rows = mi_params->mi_rows; tpl_frame->mi_cols = mi_params->mi_cols; - av1_tpl_stats_init_txfm_stats(tpl_frame, tpl_data->tpl_bsize_1d); } tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1]; @@ -150,47 +161,33 @@ void 
av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, // TODO(aomedia:2873): Explore the allocation of tpl buffers based on // lag_in_frames. for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { - CHECK_MEM_ERROR( - cm, tpl_data->tpl_stats_pool[frame], + AOM_CHECK_MEM_ERROR( + &ppi->error, tpl_data->tpl_stats_pool[frame], aom_calloc(tpl_data->tpl_stats_buffer[frame].width * tpl_data->tpl_stats_buffer[frame].height, sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr))); - if (aom_alloc_frame_buffer( - &tpl_data->tpl_rec_pool[frame], cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, tpl_data->border_in_pixels, - cm->features.byte_alignment)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + + if (aom_alloc_frame_buffer(&tpl_data->tpl_rec_pool[frame], width, height, + seq_params->subsampling_x, + seq_params->subsampling_y, + seq_params->use_highbitdepth, + tpl_data->border_in_pixels, byte_alignment)) + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } } -static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw, - tran_low_t *coeff, TX_SIZE tx_size, - int bit_depth, int is_hbd) { - TxfmParam txfm_param; - txfm_param.tx_type = DCT_DCT; - txfm_param.tx_size = tx_size; - txfm_param.lossless = 0; - txfm_param.tx_set_type = EXT_TX_SET_ALL16; - - txfm_param.bd = bit_depth; - txfm_param.is_hbd = is_hbd; - av1_fwd_txfm(src_diff, coeff, bw, &txfm_param); -} - -static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x, +static AOM_INLINE int64_t tpl_get_satd_cost(BitDepthInfo bd_info, int16_t *src_diff, int diff_stride, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, tran_low_t *coeff, int bw, int bh, TX_SIZE tx_size) { - const MACROBLOCKD *xd = &x->e_mbd; const int pix_num = bw * bh; - av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst, - dst_stride); - 
tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd)); + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); return aom_satd(coeff, pix_num); } @@ -198,7 +195,6 @@ static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); - aom_clear_system_state(); int rate_cost = 1; for (int idx = 0; idx < eob; ++idx) { @@ -215,11 +211,11 @@ static AOM_INLINE void txfm_quant_rdcost( tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size, int *rate_cost, int64_t *recon_error, int64_t *sse) { const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); uint16_t eob; - av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst, - dst_stride); - tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd, - is_cur_buf_hbd(xd)); + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error, sse); @@ -316,13 +312,16 @@ static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs, } static void get_rate_distortion( - int *rate_cost, int64_t *recon_error, int16_t *src_diff, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x, + int *rate_cost, int64_t *recon_error, int64_t *pred_error, + int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, + tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x, const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3], const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode, int mi_row, int mi_col, int use_y_only_rate_distortion) { 
+ const SequenceHeader *seq_params = cm->seq_params; *rate_cost = 0; *recon_error = 1; + *pred_error = 1; MACROBLOCKD *xd = &x->e_mbd; int is_compound = (best_mode == NEW_NEWMV); @@ -356,7 +355,8 @@ static void get_rate_distortion( for (int ref = 0; ref < 1 + is_compound; ++ref) { if (!is_inter_mode(best_mode)) { av1_predict_intra_block( - cm, xd, block_size_wide[bsize_plane], block_size_high[bsize_plane], + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize_plane], block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer, dst_buffer_stride, 0, 0, plane); @@ -405,21 +405,24 @@ static void get_rate_distortion( &this_rate, &this_recon_error, &sse); *recon_error += this_recon_error; + *pred_error += sse; *rate_cost += this_rate; } } -static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, - int mi_col, BLOCK_SIZE bsize, - TX_SIZE tx_size, +static AOM_INLINE void mode_estimation(AV1_COMP *cpi, + TplTxfmStats *tpl_txfm_stats, + MACROBLOCK *x, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, TplDepStats *tpl_stats) { AV1_COMMON *cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; (void)gf_group; MACROBLOCKD *xd = &x->e_mbd; - TplParams *tpl_data = &cpi->tpl_data; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + TplParams *tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; @@ -471,6 +474,7 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, uint8_t *predictor = is_cur_buf_hbd(xd) ? 
CONVERT_TO_BYTEPTR(predictor8) : predictor8; int64_t recon_error = 1; + int64_t pred_error = 1; memset(tpl_stats, 0, sizeof(*tpl_stats)); tpl_stats->ref_frame_index[0] = -1; @@ -493,7 +497,6 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, // Pre-load the bottom left line. if (xd->left_available && mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) { -#if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer); for (int i = 0; i < bw; ++i) @@ -504,26 +507,24 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, dst_buffer[(bw + i) * dst_buffer_stride - 1] = dst_buffer[(bw - 1) * dst_buffer_stride - 1]; } -#else - for (int i = 0; i < bw; ++i) - dst_buffer[(bw + i) * dst_buffer_stride - 1] = - dst_buffer[(bw - 1) * dst_buffer_stride - 1]; -#endif } // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED, // H_PRED, and V_PRED const PREDICTION_MODE last_intra_mode = cpi->sf.tpl_sf.prune_intra_modes ? 
D45_PRED : INTRA_MODE_END; + const SequenceHeader *seq_params = cm->seq_params; for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode; ++mode) { - av1_predict_intra_block(cm, xd, block_size_wide[bsize], - block_size_high[bsize], tx_size, mode, 0, 0, - FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, - predictor, bw, 0, 0, 0); + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], + tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, + dst_buffer_stride, predictor, bw, 0, 0, 0); - intra_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, - predictor, bw, coeff, bw, bh, tx_size); + intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); if (intra_cost < best_intra_cost) { best_intra_cost = intra_cost; @@ -607,7 +608,7 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, for (idx = 0; idx < refmv_count; ++idx) { FULLPEL_MV mv = get_fullmv_from_mv(¢er_mvs[idx].mv.as_mv); clamp_fullmv(&mv, &x->mv_limits); - center_mvs[idx].sad = (int)cpi->fn_ptr[bsize].sdf( + center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf( src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col], ref_stride); } @@ -653,8 +654,9 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv, &inter_pred_params); - inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, - predictor, bw, coeff, bw, bh, tx_size); + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); // Store inter cost for each ref frame tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost); @@ -732,8 +734,9 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, 
av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv, &inter_pred_params); } - inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, - predictor, bw, coeff, bw, bh, tx_size); + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); if (inter_cost < best_inter_cost) { best_cmp_rf_idx = cmp_rf_idx; best_inter_cost = inter_cost; @@ -760,8 +763,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, : NULL, }; int rate_cost = 1; - get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; @@ -772,7 +775,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2; tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2; - tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; // Final encode int rate_cost = 0; @@ -786,21 +790,19 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, best_mode == NEW_NEWMV ? 
tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] : NULL; - get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); - if (!tpl_use_multithread(cpi)) { - // TODO(angiebird): make this work for multithread - tpl_stats_record_txfm_block(tpl_frame, coeff); - } + av1_record_tpl_txfm_block(tpl_txfm_stats, coeff); tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; if (!is_inter_mode(best_mode)) { tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; } tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); @@ -810,8 +812,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; ref_frame_ptr[1] = tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; - get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2; @@ -831,8 +833,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, ref_frame_ptr[0] = tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; - get_rate_distortion(&rate_cost, 
&recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2; @@ -887,41 +889,24 @@ static int round_floor(int ref_pos, int bsize_pix) { return round; } -static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, - int ref_pos_col, int block, BLOCK_SIZE bsize) { - int width = 0, height = 0; - int bw = 4 << mi_size_wide_log2[bsize]; - int bh = 4 << mi_size_high_log2[bsize]; - - switch (block) { - case 0: - width = grid_pos_col + bw - ref_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 1: - width = ref_pos_col + bw - grid_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 2: - width = grid_pos_col + bw - ref_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - case 3: - width = ref_pos_col + bw - grid_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - default: assert(0); +int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, + int height) { + int min_row = AOMMAX(row_a, row_b); + int max_row = AOMMIN(row_a + height, row_b + height); + int min_col = AOMMAX(col_a, col_b); + int max_col = AOMMIN(col_a + width, col_b + width); + if (min_row < max_row && min_col < max_col) { + return (max_row - min_row) * (max_col - min_col); } - int overlap_area = width * height; - return overlap_area; + return 0; } int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) { return (mi_row >> right_shift) * stride + (mi_col >> right_shift); } -static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, - int64_t srcrf_dist, int pix_num) { +int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int 
pix_num) { double beta = (double)srcrf_dist / recrf_dist; int64_t rate_cost = delta_rate; @@ -952,7 +937,6 @@ static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, int mi_col, const BLOCK_SIZE bsize, int frame_idx, int ref) { - aom_clear_system_state(); TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; TplDepFrame *tpl_frame = tpl_data->tpl_frame; @@ -998,8 +982,8 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, tpl_stats_ptr->recrf_dist)); int64_t delta_rate = tpl_stats_ptr->recrf_rate - srcrf_rate; int64_t mc_dep_rate = - delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, - srcrf_dist, pix_num); + av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, + srcrf_dist, pix_num); for (block = 0; block < 4; ++block) { int grid_pos_row = grid_pos_row_base + bh * (block >> 1); @@ -1007,8 +991,8 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { - int overlap_area = get_overlap_area( - grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col, + ref_pos_row, ref_pos_col, bw, bh); int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; assert((1 << block_mis_log2) == mi_height); @@ -1043,6 +1027,7 @@ static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost); tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost); tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist); + tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse); tpl_ptr->recrf_dist = AOMMAX(1, 
tpl_ptr->recrf_dist); tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate); tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate); @@ -1068,12 +1053,12 @@ static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) { // Initialize the mc_flow parameters used in computing tpl data. static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, int pframe_qindex) { - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME]; uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME]; - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( gf_group, cpi->sf.inter_sf.selective_ref_frame, cpi->sf.tpl_sf.prune_ref_frames_in_tpl, frame_idx); @@ -1084,6 +1069,7 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, ThreadData *td = &cpi->td; MACROBLOCK *x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; tpl_data->frame_idx = frame_idx; tpl_reset_src_ref_frames(tpl_data); av1_tile_init(&xd->tile, cm, 0, 0); @@ -1161,18 +1147,21 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); } // This function stores the motion estimation dependencies of all the blocks in // a row -void av1_mc_flow_dispenser_row(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, - BLOCK_SIZE bsize, TX_SIZE tx_size) { +void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, + MACROBLOCK *x, int mi_row, BLOCK_SIZE bsize, + TX_SIZE tx_size) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1TplRowMultiThreadInfo 
*const tpl_row_mt = &mt_info->tpl_row_mt; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_width = mi_size_wide[bsize]; - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; MACROBLOCKD *xd = &x->e_mbd; @@ -1194,7 +1183,8 @@ void av1_mc_flow_dispenser_row(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); xd->mb_to_right_edge = GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col); - mode_estimation(cpi, x, mi_row, mi_col, bsize, tx_size, &tpl_stats); + mode_estimation(cpi, tpl_txfm_stats, x, mi_row, mi_col, bsize, tx_size, + &tpl_stats); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride, @@ -1210,40 +1200,36 @@ static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) { ThreadData *td = &cpi->td; MACROBLOCK *x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; - const BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d); + const BLOCK_SIZE bsize = + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = mi_size_high[bsize]; for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) { // Motion estimation row boundary av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, - cpi->tpl_data.border_in_pixels); + cpi->ppi->tpl_data.border_in_pixels); xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); - av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size); - } - if (!tpl_use_multithread(cpi)) { - // TODO(angiebird): make this work for multithread - TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[cpi->tpl_data.frame_idx]; - tpl_stats_update_abs_coeff_mean(tpl_frame); + av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, x, 
mi_row, bsize, + tx_size); } } -static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) { - AV1_COMMON *cm = &cpi->common; - TplParams *const tpl_data = &cpi->tpl_data; - +static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows, + int mi_cols) { + if (!frame_idx) { + return; + } const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); const int mi_height = mi_size_high[bsize]; const int mi_width = mi_size_wide[bsize]; assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2)); assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2)); - for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += mi_height) { - for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += mi_width) { - if (frame_idx) { - tpl_model_update(tpl_data, mi_row, mi_col, frame_idx); - } + for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) { + for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) { + tpl_model_update(tpl_data, mi_row, mi_col, frame_idx); } } } @@ -1253,12 +1239,17 @@ static AOM_INLINE void init_gop_frames_for_tpl( GF_GROUP *gf_group, int gop_eval, int *tpl_group_frames, const EncodeFrameInput *const frame_input, int *pframe_qindex) { AV1_COMMON *cm = &cpi->common; - int cur_frame_idx = gf_group->index; + int cur_frame_idx = cpi->gf_frame_index; *pframe_qindex = 0; +#if CONFIG_FRAME_PARALLEL_ENCODE + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + RefBufferStack ref_buffer_stack = cpi->ref_buffer_stack; EncodeFrameParams frame_params = *init_frame_params; - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; int ref_picture_map[REF_FRAMES]; @@ -1288,7 +1279,7 @@ static AOM_INLINE void init_gop_frames_for_tpl( TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index]; int frame_display_index = gf_index == 
gf_group->size - ? cpi->rc.baseline_gf_interval + ? cpi->ppi->p_rc.baseline_gf_interval : gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index]; @@ -1317,7 +1308,7 @@ static AOM_INLINE void init_gop_frames_for_tpl( } if (gop_eval && cpi->rc.frames_since_key > 0 && gf_group->arf_index == gf_index) - tpl_frame->gf_picture = &cpi->alt_ref_buffer; + tpl_frame->gf_picture = &cpi->ppi->alt_ref_buffer; // 'cm->current_frame.frame_number' is the display number // of the current frame. @@ -1338,15 +1329,45 @@ static AOM_INLINE void init_gop_frames_for_tpl( tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; ++process_frame_count; } - - av1_get_ref_frames(cpi, &ref_buffer_stack); - int refresh_mask = av1_get_refresh_frame_flags( - cpi, &frame_params, frame_update_type, &ref_buffer_stack); +#if CONFIG_FRAME_PARALLEL_ENCODE + const int true_disp = (int)(tpl_frame->frame_display_index); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + + av1_get_ref_frames(&ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, true_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); + + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + true_disp, ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + &ref_buffer_stack); + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Make the frames marked as is_frame_non_ref to non-reference frames. 
+ if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0; +#endif // CONFIG_FRAME_PARALLEL_ENCODE int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); +#if !CONFIG_FRAME_PARALLEL_ENCODE av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type, frame_params.show_existing_frame, refresh_frame_map_index, &ref_buffer_stack); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +#if CONFIG_FRAME_PARALLEL_ENCODE + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) tpl_frame->ref_map_index[i - LAST_FRAME] = @@ -1360,8 +1381,9 @@ static AOM_INLINE void init_gop_frames_for_tpl( if (cpi->rc.frames_since_key == 0) return; int extend_frame_count = 0; - int extend_frame_length = AOMMIN( - MAX_TPL_EXTEND, cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval); + int extend_frame_length = + AOMMIN(MAX_TPL_EXTEND, + cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval); int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] + gf_group->arf_src_offset[gop_length - 1] + 1; @@ -1400,14 +1422,37 @@ static AOM_INLINE void init_gop_frames_for_tpl( gf_group->update_type[gf_index] = LF_UPDATE; gf_group->q_val[gf_index] = *pframe_qindex; - - av1_get_ref_frames(cpi, &ref_buffer_stack); - int refresh_mask = av1_get_refresh_frame_flags( - cpi, &frame_params, frame_update_type, &ref_buffer_stack); +#if CONFIG_FRAME_PARALLEL_ENCODE + const int true_disp = (int)(tpl_frame->frame_display_index); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + av1_get_ref_frames(&ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, true_disp, +#endif // 
CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + true_disp, ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + &ref_buffer_stack); int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); +#if !CONFIG_FRAME_PARALLEL_ENCODE av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type, frame_params.show_existing_frame, refresh_frame_map_index, &ref_buffer_stack); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +#if CONFIG_FRAME_PARALLEL_ENCODE + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) tpl_frame->ref_map_index[i - LAST_FRAME] = @@ -1424,8 +1469,16 @@ static AOM_INLINE void init_gop_frames_for_tpl( ++extend_frame_count; ++frame_display_index; } - - av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); +#if CONFIG_FRAME_PARALLEL_ENCODE + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cur_frame_idx]; + const int true_disp = (int)(tpl_frame->frame_display_index); + init_ref_map_pair(cpi, ref_frame_map_pairs); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + av1_get_ref_frames(&cpi->ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, true_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); } void av1_init_tpl_stats(TplParams *const tpl_data) { @@ -1440,9 +1493,47 @@ void av1_init_tpl_stats(TplParams *const tpl_data) { sizeof(*tpl_frame->tpl_stats_ptr)); tpl_frame->is_valid = 0; } - for (frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) { - TplDepFrame *tpl_frame = 
&tpl_data->tpl_stats_buffer[frame_idx]; - av1_tpl_stats_init_txfm_stats(tpl_frame, tpl_data->tpl_bsize_1d); +#if CONFIG_BITRATE_ACCURACY + tpl_data->estimated_gop_bitrate = 0; + tpl_data->actual_gop_bitrate = 0; +#endif +} + +static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) { + switch (gop_eval) { + case 1: + // Allow larger GOP size if the base layer ARF has higher dependency + // factor than the intermediate ARF and both ARFs have reasonably high + // dependency factors. + return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0; + case 2: + if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6) + return 1; // Don't shorten the gf interval + else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4) + return 0; // Shorten the gf interval + else + return 2; // Cannot decide the gf interval, so redo the + // tpl stats calculation. + case 3: return beta[0] > 1.1; + default: return 2; + } +} + +// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down +// the scope of input arguments. 
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + AV1_COMMON *cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + int bottom_index, top_index; + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { + cm->current_frame.frame_type = gf_group->frame_type[gf_index]; + cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && + gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; + gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds( + cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index); } } @@ -1455,10 +1546,17 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, AV1_COMMON *cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; - GF_GROUP *gf_group = &cpi->gf_group; - int bottom_index, top_index; + GF_GROUP *gf_group = &cpi->ppi->gf_group; EncodeFrameParams this_frame_params = *frame_params; - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + int approx_gop_eval = (gop_eval > 1); + int num_arf_layers = MAX_ARF_LAYERS; + + // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base + // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3, + // tpl stats calculation is limited to ARFs from base layer and (base+1) + // layer. + if (approx_gop_eval) num_arf_layers = (gop_eval == 2) ? 
3 : 2; if (cpi->superres_mode != AOM_SUPERRES_NONE) { assert(cpi->superres_mode != AOM_SUPERRES_AUTO); @@ -1467,7 +1565,8 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, } cm->current_frame.frame_type = frame_params->frame_type; - for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) { + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { cm->current_frame.frame_type = gf_group->frame_type[gf_index]; av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, gf_group->update_type[gf_index], @@ -1475,13 +1574,6 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame, sizeof(cpi->refresh_frame)); - - cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && - gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; - - gf_group->q_val[gf_index] = - av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index, - &bottom_index, &top_index); } int pframe_qindex; @@ -1489,7 +1581,7 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, init_gop_frames_for_tpl(cpi, frame_params, gf_group, gop_eval, &tpl_gf_group_frames, frame_input, &pframe_qindex); - cpi->rc.base_layer_qp = pframe_qindex; + cpi->ppi->p_rc.base_layer_qp = pframe_qindex; av1_init_tpl_stats(tpl_data); @@ -1505,37 +1597,59 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs); + const int gop_length = get_gop_length(gf_group); // Backward propagation from tpl_group_frames to 1. 
- for (int frame_idx = gf_group->index; frame_idx < tpl_gf_group_frames; + for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames; ++frame_idx) { if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || gf_group->update_type[frame_idx] == OVERLAY_UPDATE) continue; + // When approx_gop_eval = 1, skip tpl stats calculation for higher layer + // frames and for frames beyond gop length. + if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers || + frame_idx >= gop_length)) + continue; + init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex); - if (tpl_use_multithread(cpi)) { + if (mt_info->num_workers > 1) { tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read; tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write; av1_mc_flow_dispenser_mt(cpi); } else { mc_flow_dispenser(cpi); } + av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx); aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture, av1_num_planes(cm)); } - for (int frame_idx = tpl_gf_group_frames - 1; frame_idx >= gf_group->index; - --frame_idx) { +#if CONFIG_BITRATE_ACCURACY + tpl_data->estimated_gop_bitrate = av1_estimate_gop_bitrate( + gf_group->q_val, gf_group->size, tpl_data->txfm_stats_list); + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && + gop_eval == 0) { + printf("\nestimated bitrate: %f\n", tpl_data->estimated_gop_bitrate); + } +#endif + + for (int frame_idx = tpl_gf_group_frames - 1; + frame_idx >= cpi->gf_frame_index; --frame_idx) { if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || gf_group->update_type[frame_idx] == OVERLAY_UPDATE) continue; - mc_flow_synthesizer(cpi, frame_idx); + if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers || + frame_idx >= gop_length)) + continue; + + mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows, + cm->mi_params.mi_cols); } av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, - 
gf_group->update_type[gf_group->index], + gf_group->update_type[cpi->gf_frame_index], frame_params->frame_type, 0); cm->current_frame.frame_type = frame_params->frame_type; cm->show_frame = frame_params->show_frame; @@ -1592,21 +1706,17 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_tpl_setup_stats_time); #endif - - // Allow larger GOP size if the base layer ARF has higher dependency factor - // than the intermediate ARF and both ARFs have reasonably high dependency - // factors. - return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0; + return eval_gop_length(beta, gop_eval); } void av1_tpl_rdmult_setup(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - const int tpl_idx = gf_group->index; + const int tpl_idx = cpi->gf_frame_index; - assert(IMPLIES(gf_group->size > 0, tpl_idx < gf_group->size)); + assert( + IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size)); - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx]; if (!tpl_frame->is_valid) return; @@ -1623,8 +1733,6 @@ void av1_tpl_rdmult_setup(AV1_COMP *cpi) { const double c = 1.2; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; - aom_clear_system_state(); - // Loop through each 'block_size' X 'block_size' block. 
for (int row = 0; row < num_rows; row++) { for (int col = 0; col < num_cols; col++) { @@ -1647,24 +1755,23 @@ void av1_tpl_rdmult_setup(AV1_COMP *cpi) { } const double rk = intra_cost / mc_dep_cost; const int index = row * num_cols + col; - cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; + cpi->ppi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; } } - aom_clear_system_state(); } void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE sb_size, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; - GF_GROUP *gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; - - if (tpl_frame->is_valid == 0) return; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + if (tpl_idx >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx]; + if (!tpl_frame->is_valid) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; const int mi_col_sr = @@ -1685,13 +1792,12 @@ void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, double base_block_count = 0.0; double log_sum = 0.0; - aom_clear_system_state(); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; - log_sum += log(cpi->tpl_rdmult_scaling_factors[index]); + log_sum += log(cpi->ppi->tpl_rdmult_scaling_factors[index]); base_block_count += 1.0; } } @@ -1705,33 +1811,30 @@ void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const 
x, const double scaling_factor = (double)new_rdmult / (double)orig_rdmult; double scale_adj = log(scaling_factor) - log_sum / base_block_count; - scale_adj = exp(scale_adj); + scale_adj = exp_bounded(scale_adj); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; - cpi->tpl_sb_rdmult_scaling_factors[index] = - scale_adj * cpi->tpl_rdmult_scaling_factors[index]; + cpi->ppi->tpl_sb_rdmult_scaling_factors[index] = + scale_adj * cpi->ppi->tpl_rdmult_scaling_factors[index]; } } - aom_clear_system_state(); } -#define EPSILON (0.0000001) - double av1_exponential_entropy(double q_step, double b) { - aom_clear_system_state(); - double z = fmax(exp(-q_step / b), EPSILON); + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); return -log2(1 - z) - z * log2(z) / (1 - z); } double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) { - aom_clear_system_state(); // zero bin's size is zero_bin_ratio * q_step // non-zero bin's size is q_step - double z = fmax(exp(-zero_bin_ratio / 2 * q_step / b), EPSILON); + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); double h = av1_exponential_entropy(q_step, b); double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1); return r; @@ -1740,7 +1843,6 @@ double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) { double av1_laplace_estimate_frame_rate(int q_index, int block_count, const double *abs_coeff_mean, int coeff_num) { - aom_clear_system_state(); double zero_bin_ratio = 2; double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; @@ -1755,3 +1857,58 @@ double av1_laplace_estimate_frame_rate(int q_index, int block_count, est_rate *= block_count; return 
est_rate; } + +double av1_estimate_gop_bitrate(const unsigned char *q_index_list, + const int frame_count, + const TplTxfmStats *stats_list) { + double gop_bitrate = 0; + for (int frame_index = 0; frame_index < frame_count; frame_index++) { + int q_index = q_index_list[frame_index]; + TplTxfmStats frame_stats = stats_list[frame_index]; + + /* Convert to mean absolute deviation */ + double abs_coeff_mean[256] = { 0 }; + for (int i = 0; i < 256; i++) { + abs_coeff_mean[i] = + frame_stats.abs_coeff_sum[i] / frame_stats.txfm_block_count; + } + + double frame_bitrate = av1_laplace_estimate_frame_rate( + q_index, frame_stats.txfm_block_count, abs_coeff_mean, 256); + gop_bitrate += frame_bitrate; + } + return gop_bitrate; +} + +double av1_estimate_coeff_entropy(double q_step, double b, + double zero_bin_ratio, int qcoeff) { + b = AOMMAX(b, TPL_EPSILON); + int abs_qcoeff = abs(qcoeff); + double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + if (abs_qcoeff == 0) { + double r = -log2(1 - z0); + return r; + } else { + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); + double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z); + return r; + } +} + +double av1_estimate_txfm_block_entropy(int q_index, + const double *abs_coeff_mean, + int *qcoeff_arr, int coeff_num) { + double zero_bin_ratio = 2; + double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double est_rate = 0; + // dc coeff + est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0], + zero_bin_ratio, qcoeff_arr[0]); + // ac coeff + for (int i = 1; i < coeff_num; ++i) { + est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i], + zero_bin_ratio, qcoeff_arr[i]); + } + return est_rate; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/tpl_model.h b/third_party/libaom/source/libaom/av1/encoder/tpl_model.h index 4b85740f3e..c764d92239 100644 --- 
a/third_party/libaom/source/libaom/av1/encoder/tpl_model.h +++ b/third_party/libaom/source/libaom/av1/encoder/tpl_model.h @@ -18,11 +18,20 @@ extern "C" { /*!\cond */ +struct AV1_PRIMARY; struct AV1_COMP; +struct AV1_SEQ_CODING_TOOLS; struct EncodeFrameParams; struct EncodeFrameInput; -#include "av1/encoder/encoder.h" +#include "config/aom_config.h" + +#include "aom_scale/yv12config.h" + +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/encoder/block.h" +#include "av1/encoder/lookahead.h" static INLINE BLOCK_SIZE convert_length_to_bsize(int length) { switch (length) { @@ -82,6 +91,14 @@ typedef struct AV1TplRowMultiThreadInfo { #define MAX_TPL_EXTEND (MAX_LAG_BUFFERS - MAX_GF_INTERVAL) #define TPL_DEP_COST_SCALE_LOG2 4 +#define TPL_EPSILON 0.0000001 + +typedef struct TplTxfmStats { + double abs_coeff_sum[256]; // Assume we are using 16x16 transform block + int txfm_block_count; + int coeff_num; +} TplTxfmStats; + typedef struct TplDepStats { int64_t intra_cost; int64_t inter_cost; @@ -90,6 +107,7 @@ typedef struct TplDepStats { int64_t cmp_recrf_dist[2]; int64_t srcrf_rate; int64_t recrf_rate; + int64_t srcrf_sse; int64_t cmp_recrf_rate[2]; int64_t mc_dep_rate; int64_t mc_dep_dist; @@ -111,10 +129,6 @@ typedef struct TplDepFrame { int mi_cols; int base_rdmult; uint32_t frame_display_index; - double abs_coeff_sum[256]; // Assume we are using 16x16 transform block - double abs_coeff_mean[256]; - int coeff_num; // number of coefficients in a transform block - int txfm_block_count; } TplDepFrame; /*!\endcond */ @@ -147,6 +161,12 @@ typedef struct TplParams { TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS]; /*! + * Buffer to store tpl transform stats per frame. + * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group. + */ + TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS]; + + /*! * Buffer to store tpl reconstructed frame. * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group. 
*/ @@ -192,10 +212,13 @@ typedef struct TplParams { */ int border_in_pixels; - /*! - * Skip tpl setup when tpl data from gop length decision can be reused. +#if CONFIG_BITRATE_ACCURACY + /* + * Estimated and actual GOP bitrate. */ - int skip_tpl_setup_stats; + double estimated_gop_bitrate; + double actual_gop_bitrate; +#endif } TplParams; /*!\brief Allocate buffers used by tpl model @@ -206,8 +229,9 @@ typedef struct TplParams { * \param[out] tpl_data tpl data structure */ -void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, - int lag_in_frames); +void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi, + CommonModeInfoParams *const mi_params, int width, + int height, int byte_alignment, int lag_in_frames); /*!\brief Implements temporal dependency modelling for a GOP (GF/ARF * group) and selects between 16 and 32 frame GOP structure. @@ -227,6 +251,9 @@ int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval, /*!\cond */ +void av1_tpl_preload_rc_estimate( + struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params); + int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift); void av1_init_tpl_stats(TplParams *const tpl_data); @@ -236,8 +263,9 @@ void av1_tpl_rdmult_setup(struct AV1_COMP *cpi); void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE sb_size, int mi_row, int mi_col); -void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, - BLOCK_SIZE bsize, TX_SIZE tx_size); +void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi, + TplTxfmStats *tpl_txfm_stats, MACROBLOCK *x, + int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size); /*!\brief Compute the entropy of an exponential probability distribution * function (pdf) subjected to uniform quantization. 
@@ -271,7 +299,7 @@ double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio); /*!\brief Compute the frame rate using transform block stats * * Assume each position i in the transform block is of Laplace distribution - * with maximum absolute deviation abs_coeff_mean[i] + * with mean absolute deviation abs_coeff_mean[i] * * Then we can use av1_laplace_entropy() to compute the expected frame * rate. @@ -280,7 +308,7 @@ double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio); * * \param[in] q_index quantizer index * \param[in] block_count number of transform blocks - * \param[in] abs_coeff_mean array of maximum absolute deviation + * \param[in] abs_coeff_mean array of mean absolute deviation * \param[in] coeff_num number of coefficients per transform block * * \return expected frame rate @@ -289,15 +317,104 @@ double av1_laplace_estimate_frame_rate(int q_index, int block_count, const double *abs_coeff_mean, int coeff_num); -/*!\brief Init data structure storing transform stats +/* + *!\brief Compute the number of bits needed to encode a GOP + * + * \param[in] q_index_list array of q_index, one per frame + * \param[in] frame_count number of frames in the GOP + * \param[in] stats array of transform stats, one per frame + * + */ +double av1_estimate_gop_bitrate(const unsigned char *q_index_list, + const int frame_count, + const TplTxfmStats *stats); + +/* + *!\brief Init TplTxfmStats + * + * \param[in] tpl_txfm_stats a structure for storing transform stats + * + * + */ +void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats); + +/* + *!\brief Accumulate TplTxfmStats + * + * \param[in] sub_stats a structure for storing sub transform stats + * \param[out] accumulated_stats a structure for storing accumulated transform + *stats + * + */ +void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, + TplTxfmStats *accumulated_stats); + +/* + *!\brief Record a transform block into TplTxfmStats + * + * \param[in] tpl_txfm_stats A 
structure for storing transform stats + * \param[out] coeff An array of transform coefficients. Its size + * should equal to tpl_txfm_stats.coeff_num. + * + */ +void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, + const tran_low_t *coeff); + +/*!\brief Estimate coefficient entropy using Laplace dsitribution * *\ingroup tpl_modelling * - * \param[in] tpl_frame pointer of tpl frame data structure + * This function is equivalent to -log2(laplace_prob()), where laplace_prob() is + * defined in tpl_model_test.cc + * + * \param[in] q_step quantizer step size without any scaling + * \param[in] b mean absolute deviation of Laplace distribution + * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step + * \param[in] qcoeff quantized coefficient + * + * \return estimated coefficient entropy + * + */ +double av1_estimate_coeff_entropy(double q_step, double b, + double zero_bin_ratio, int qcoeff); + +/*!\brief Estimate entropy of a transform block using Laplace dsitribution + * + *\ingroup tpl_modelling + * + * \param[in] q_index quantizer index + * \param[in] abs_coeff_mean array of mean absolute deviations + * \param[in] qcoeff_arr array of quantized coefficients * \param[in] coeff_num number of coefficients per transform block * + * \return estimated transform block entropy + * + */ +double av1_estimate_txfm_block_entropy(int q_index, + const double *abs_coeff_mean, + int *qcoeff_arr, int coeff_num); + +// TODO(angiebird): Add doxygen description here. +int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int pix_num); + +/*!\brief Compute the overlap area between two blocks with the same size + * + *\ingroup tpl_modelling + * + * If there is no overlap, this function should return zero. 
+ * + * \param[in] row_a row position of the first block + * \param[in] col_a column position of the first block + * \param[in] row_b row position of the second block + * \param[in] col_b column position of the second block + * \param[in] width width shared by the two blocks + * \param[in] height height shared by the two blocks + * + * \return overlap area of the two blocks */ -void av1_tpl_stats_init_txfm_stats(TplDepFrame *tpl_frame, int coeff_num); +int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, + int height); /*!\endcond */ #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c index 39940e8aa6..f82e910595 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c +++ b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c @@ -15,24 +15,34 @@ #include "aom_dsp/butteraugli.h" #include "aom_ports/system_state.h" -#include "av1/encoder/rdopt.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/extend.h" +#include "av1/encoder/var_based_part.h" static const int resize_factor = 2; -void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *recon) { +static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *recon, + const double K) { AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; const CommonModeInfoParams *const mi_params = &cm->mi_params; + const aom_color_range_t color_range = + seq_params->color_range != 0 ? 
AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_crop_width; const int height = source->y_crop_height; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; float *diffmap; CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap))); - if (!aom_calc_butteraugli(source, recon, bit_depth, diffmap)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + if (!aom_calc_butteraugli(source, recon, bit_depth, + seq_params->matrix_coefficients, color_range, + diffmap)) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to calculate Butteraugli distances."); } @@ -55,6 +65,7 @@ void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, const int x_start = col * block_w; float dbutteraugli = 0.0f; float dmse = 0.0f; + float px_count = 0.0f; // Loop through each pixel. for (int y = y_start; y < y_start + block_h && y < height; y++) { @@ -63,25 +74,28 @@ void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, float px_diff = source->y_buffer[y * source->y_stride + x] - recon->y_buffer[y * recon->y_stride + x]; dmse += px_diff * px_diff; + px_count += 1.0f; } } - for (int y = y_start; y < y_start + block_h && y < height; y += 2) { - for (int x = x_start; x < x_start + block_w && x < width; x += 2) { - const int src_px_index = y / 2 * source->uv_stride + x / 2; - const int recon_px_index = y / 2 * recon->uv_stride + x / 2; + const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y), + (height + ss_y) >> ss_y); + for (int y = y_start >> ss_y; y < y_end; y++) { + const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x), + (width + ss_x) >> ss_x); + for (int x = x_start >> ss_x; x < x_end; x++) { + const int src_px_index = y * source->uv_stride + x; + const int recon_px_index = y * recon->uv_stride + x; const float px_diff_u = (float)(source->u_buffer[src_px_index] - recon->u_buffer[recon_px_index]); const float px_diff_v = (float)(source->v_buffer[src_px_index] - 
recon->v_buffer[recon_px_index]); dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v; + px_count += 2.0f; } } dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f); - dmse = dmse / (2.0f * (float)block_w * (float)block_h); - // 'K' is used to balance the rate-distortion distribution between PSNR - // and Butteraugli. - const double K = 0.4; + dmse = dmse / px_count; const float eps = 0.01f; double weight; if (dbutteraugli < eps || dmse < eps) { @@ -166,10 +180,12 @@ static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int width, int height) { copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width, height); + const int width_uv = (width + src->subsampling_x) >> src->subsampling_x; + const int height_uv = (height + src->subsampling_y) >> src->subsampling_y; copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, - width / 2, height / 2); + width_uv, height_uv); copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, - width / 2, height / 2); + width_uv, height_uv); } static void zero_plane(uint8_t *dst, int dst_stride, int h) { @@ -192,9 +208,11 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) { const int width = cpi->source->y_crop_width; const int height = cpi->source->y_crop_height; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; if (dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( - dst, width, height, 1, 1, cm->seq_params.use_highbitdepth, + dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); } av1_copy_and_extend_frame(cpi->source, dst); @@ -202,8 +220,8 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) { YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source; if (resized_dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( - resized_dst, width / resize_factor, height / resize_factor, 1, 1, - 
cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); } av1_resize_and_extend_frame_nonnormative(cpi->source, resized_dst, bit_depth, @@ -215,25 +233,86 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) { aom_clear_system_state(); } -void av1_restore_butteraugli_source(AV1_COMP *cpi) { +void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) { aom_clear_system_state(); av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source); AV1_COMMON *const cm = &cpi->common; const int width = cpi->source->y_crop_width; const int height = cpi->source->y_crop_height; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; YV12_BUFFER_CONFIG resized_recon; memset(&resized_recon, 0, sizeof(resized_recon)); aom_alloc_frame_buffer( - &resized_recon, width / resize_factor, height / resize_factor, 1, 1, - cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor, height / resize_factor); set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source, - &resized_recon); + &resized_recon, K); cpi->butteraugli_info.recon_set = true; aom_free_frame_buffer(&resized_recon); aom_clear_system_state(); } + +void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const QuantizationCfg *const q_cfg = &oxcf->q_cfg; + const int q_index = 96; + aom_clear_system_state(); + + // Setup necessary params for encoding, including frame source, etc. 
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi); + av1_set_frame_size(cpi, cm->superres_upscaled_width, + cm->superres_upscaled_height); + + cpi->source = + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source, + cm->features.interp_filter, 0, false, false); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + cm->features.interp_filter, 0, false, false); + } + + av1_setup_butteraugli_source(cpi); + av1_setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + const PARTITION_SEARCH_TYPE partition_search_type = + cpi->sf.part_sf.partition_search_type; + const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size; + // Enable a quicker pass by uncommenting the following lines: + // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; + // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index, + q_cfg->enable_chroma_deltaq); + av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); + if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + + av1_set_variance_partition_thresholds(cpi, q_index, 0); + av1_encode_frame(cpi); + + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3); + cpi->sf.part_sf.partition_search_type = partition_search_type; + cpi->sf.part_sf.fixed_partition_size = fixed_partition_size; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h 
b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h index a4af31c718..7b7b0b64d3 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h +++ b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h @@ -38,6 +38,10 @@ void av1_setup_butteraugli_recon(AV1_COMP *cpi, void av1_setup_butteraugli_source(AV1_COMP *cpi); -void av1_restore_butteraugli_source(AV1_COMP *cpi); +// 'K' is used to balance the rate-distortion distribution between PSNR +// and Butteraugli. +void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K); + +void av1_setup_butteraugli_rdmult(AV1_COMP *cpi); #endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c index f5b6129407..0c28cebefa 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c +++ b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c @@ -15,9 +15,7 @@ #include "aom_ports/system_state.h" #include "av1/encoder/extend.h" #include "av1/encoder/rdopt.h" -#if CONFIG_USE_VMAF_RC #include "config/aom_scale_rtcd.h" -#endif static const double kBaselineVmaf = 97.42773; @@ -89,9 +87,9 @@ static unsigned int residual_variance(const AV1_COMP *cpi, assert(y_stride == ref->y_stride); const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; const int mv_offset = ref_mv.row * y_stride + ref_mv.col; - const unsigned int var = - cpi->fn_ptr[block_size].vf(ref->y_buffer + y_offset + mv_offset, y_stride, - src->y_buffer + y_offset, y_stride, sse); + const unsigned int var = cpi->ppi->fn_ptr[block_size].vf( + ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset, + y_stride, sse); return var; } @@ -117,7 +115,7 @@ static double frame_average_variance(const AV1_COMP *const cpi, buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y; buf.stride = y_stride; - if (cpi->common.seq_params.use_highbitdepth) { + if 
(cpi->common.seq_params->use_highbitdepth) { assert(frame->flags & YV12_FLAG_HIGHBITDEPTH); var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size, bit_depth); @@ -234,7 +232,7 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *blurred, const YV12_BUFFER_CONFIG *dst, double amount) { const int bit_depth = cpi->td.mb.e_mbd.bd; - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH); assert(dst->flags & YV12_FLAG_HIGHBITDEPTH); @@ -294,38 +292,27 @@ static AOM_INLINE void gaussian_blur(const int bit_depth, } static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi, -#if CONFIG_USE_VMAF_RC - VmafContext *vmaf_context, - int *vmaf_cal_index, -#endif double source_variance, YV12_BUFFER_CONFIG *const source, YV12_BUFFER_CONFIG *const sharpened) { const int bit_depth = cpi->td.mb.e_mbd.bd; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; double new_vmaf; -#if CONFIG_USE_VMAF_RC - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, source, - sharpened, bit_depth, *vmaf_cal_index, &new_vmaf); - (*vmaf_cal_index)++; -#else - aom_calc_vmaf(cpi->oxcf.tune_cfg.vmaf_model_path, source, sharpened, - bit_depth, &new_vmaf); -#endif + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth, + cal_vmaf_neg, &new_vmaf); const double sharpened_var = frame_average_variance(cpi, sharpened); return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf); } static double find_best_frame_unsharp_amount_loop( - const AV1_COMP *const cpi, -#if CONFIG_USE_VMAF_RC - VmafContext *vmaf_context, int *vmaf_cal_index, -#endif - YV12_BUFFER_CONFIG *const source, YV12_BUFFER_CONFIG *const blurred, - YV12_BUFFER_CONFIG *const sharpened, double best_vmaf, - const double baseline_variance, const double unsharp_amount_start, - const double 
step_size, const int max_loop_count, const double max_amount) { + const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, + double best_vmaf, const double baseline_variance, + const double unsharp_amount_start, const double step_size, + const int max_loop_count, const double max_amount) { const double min_amount = 0.0; int loop_count = 0; double approx_vmaf = best_vmaf; @@ -335,11 +322,7 @@ static double find_best_frame_unsharp_amount_loop( unsharp_amount += step_size; if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; unsharp(cpi, source, blurred, sharpened, unsharp_amount); - approx_vmaf = cal_approx_vmaf(cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, vmaf_cal_index, -#endif - baseline_variance, source, sharpened); + approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened); loop_count++; } while (approx_vmaf > best_vmaf && loop_count < max_loop_count); @@ -358,73 +341,43 @@ static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; -#if CONFIG_USE_VMAF_RC - VmafContext *vmaf_context; - aom_init_vmaf_context_rc( - &vmaf_context, cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN); - int vmaf_cal_index = 0; -#endif YV12_BUFFER_CONFIG sharpened; memset(&sharpened, 0, sizeof(sharpened)); aom_alloc_frame_buffer( - &sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + &sharpened, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); const double baseline_variance = frame_average_variance(cpi, source); double unsharp_amount; if (unsharp_amount_start <= step_size) { unsharp_amount = find_best_frame_unsharp_amount_loop( - cpi, 
-#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - source, blurred, &sharpened, 0.0, baseline_variance, 0.0, step_size, - max_loop_count, max_filter_amount); + cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0, + step_size, max_loop_count, max_filter_amount); } else { double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start; double v0, v1; unsharp(cpi, source, blurred, &sharpened, a0); - v0 = cal_approx_vmaf(cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - baseline_variance, source, &sharpened); + v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); unsharp(cpi, source, blurred, &sharpened, a1); - v1 = cal_approx_vmaf(cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - baseline_variance, source, &sharpened); + v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); if (fabs(v0 - v1) < 0.01) { unsharp_amount = a0; } else if (v0 > v1) { unsharp_amount = find_best_frame_unsharp_amount_loop( - cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - source, blurred, &sharpened, v0, baseline_variance, a0, -step_size, - max_loop_count, max_filter_amount); + cpi, source, blurred, &sharpened, v0, baseline_variance, a0, + -step_size, max_loop_count, max_filter_amount); } else { unsharp_amount = find_best_frame_unsharp_amount_loop( - cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - source, blurred, &sharpened, v1, baseline_variance, a1, step_size, - max_loop_count, max_filter_amount); + cpi, source, blurred, &sharpened, v1, baseline_variance, a1, + step_size, max_loop_count, max_filter_amount); } } aom_free_frame_buffer(&sharpened); -#if CONFIG_USE_VMAF_RC - aom_close_vmaf_context_rc(vmaf_context); -#endif return unsharp_amount; } -#if CONFIG_USE_VMAF_RC void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source) { aom_clear_system_state(); @@ -433,9 +386,9 @@ void av1_vmaf_neg_preprocessing(AV1_COMP 
*const cpi, const int width = source->y_width; const int height = source->y_height; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double best_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); @@ -444,15 +397,15 @@ void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG blurred; memset(&blurred, 0, sizeof(blurred)); aom_alloc_frame_buffer( - &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); gaussian_blur(bit_depth, source, &blurred); unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); aom_free_frame_buffer(&blurred); aom_clear_system_state(); } -#endif void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source) { @@ -466,19 +419,21 @@ void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, memset(&source_extended, 0, sizeof(source_extended)); memset(&blurred, 0, sizeof(blurred)); aom_alloc_frame_buffer( - &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + &source_extended, width, height, source->subsampling_x, + source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); aom_alloc_frame_buffer( - &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); av1_copy_and_extend_frame(source, 
&source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); aom_free_frame_buffer(&source_extended); - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); @@ -500,24 +455,27 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, const int width = source->y_width; const int height = source->y_height; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; YV12_BUFFER_CONFIG source_extended, blurred; memset(&blurred, 0, sizeof(blurred)); memset(&source_extended, 0, sizeof(source_extended)); aom_alloc_frame_buffer( - &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); aom_free_frame_buffer(&source_extended); - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); 
@@ -540,12 +498,14 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG source_block, blurred_block; memset(&source_block, 0, sizeof(source_block)); memset(&blurred_block, 0, sizeof(blurred_block)); - aom_alloc_frame_buffer( - &source_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &blurred_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { @@ -555,7 +515,7 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, const int block_height = AOMMIN(height - row_offset_y, block_h); const int index = col + row * num_cols; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + @@ -624,7 +584,7 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, const int block_height = AOMMIN(source->y_height - row_offset_y, block_h); const int index = col + row * num_cols; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + @@ -654,93 +614,6 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, aom_clear_system_state(); } -#if !CONFIG_USE_VMAF_RC -typedef struct FrameData { - const YV12_BUFFER_CONFIG 
*source, *blurred; - int block_w, block_h, num_rows, num_cols, row, col, bit_depth; -} FrameData; - -// A callback function used to pass data to VMAF. -// Returns 0 after reading a frame. -// Returns 2 when there is no more frame to read. -static int update_frame(float *ref_data, float *main_data, float *temp_data, - int stride, void *user_data) { - FrameData *frames = (FrameData *)user_data; - const int width = frames->source->y_width; - const int height = frames->source->y_height; - const int row = frames->row; - const int col = frames->col; - const int num_rows = frames->num_rows; - const int num_cols = frames->num_cols; - const int block_w = frames->block_w; - const int block_h = frames->block_h; - const YV12_BUFFER_CONFIG *source = frames->source; - const YV12_BUFFER_CONFIG *blurred = frames->blurred; - const int bit_depth = frames->bit_depth; - const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); - (void)temp_data; - stride /= (int)sizeof(*ref_data); - - for (int i = 0; i < height; ++i) { - float *ref, *main; - ref = ref_data + i * stride; - main = main_data + i * stride; - if (source->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *src; - src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride; - for (int j = 0; j < width; ++j) { - ref[j] = main[j] = scale_factor * (float)src[j]; - } - } else { - uint8_t *src; - src = source->y_buffer + i * source->y_stride; - for (int j = 0; j < width; ++j) { - ref[j] = main[j] = (float)src[j]; - } - } - } - if (row < num_rows && col < num_cols) { - // Set current block - const int row_offset = row * block_h; - const int col_offset = col * block_w; - const int block_width = AOMMIN(width - col_offset, block_w); - const int block_height = AOMMIN(height - row_offset, block_h); - - float *main_buf = main_data + col_offset + row_offset * stride; - if (source->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) + - row_offset * blurred->y_stride + col_offset; - 
for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - main_buf[j] = scale_factor * (float)blurred_buf[j]; - } - main_buf += stride; - blurred_buf += blurred->y_stride; - } - } else { - uint8_t *blurred_buf = - blurred->y_buffer + row_offset * blurred->y_stride + col_offset; - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - main_buf[j] = (float)blurred_buf[j]; - } - main_buf += stride; - blurred_buf += blurred->y_stride; - } - } - - frames->col++; - if (frames->col >= num_cols) { - frames->col = 0; - frames->row++; - } - return 0; - } else { - return 2; - } -} -#endif - void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const int y_width = cpi->source->y_width; @@ -748,13 +621,15 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { const int resized_block_size = BLOCK_32X32; const int resize_factor = 2; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; aom_clear_system_state(); YV12_BUFFER_CONFIG resized_source; memset(&resized_source, 0, sizeof(resized_source)); aom_alloc_frame_buffer( - &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1, - cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x, + ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source, bit_depth, av1_num_planes(cm)); @@ -770,42 +645,26 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { YV12_BUFFER_CONFIG blurred; memset(&blurred, 0, sizeof(blurred)); - aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1, - cm->seq_params.use_highbitdepth, + aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x, + ss_y, cm->seq_params->use_highbitdepth, 
cpi->oxcf.border_in_pixels, cm->features.byte_alignment); gaussian_blur(bit_depth, &resized_source, &blurred); -#if CONFIG_USE_VMAF_RC YV12_BUFFER_CONFIG recon; memset(&recon, 0, sizeof(recon)); - aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, 1, 1, - cm->seq_params.use_highbitdepth, + aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); aom_yv12_copy_frame(&resized_source, &recon, 1); VmafContext *vmaf_context; - aom_init_vmaf_context_rc( - &vmaf_context, cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN); -#else - double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols)); - memset(scores, 0, sizeof(*scores) * (num_rows * num_cols)); - FrameData frame_data; - frame_data.source = &resized_source; - frame_data.blurred = &blurred; - frame_data.block_w = resized_block_w; - frame_data.block_h = resized_block_h; - frame_data.num_rows = num_rows; - frame_data.num_cols = num_cols; - frame_data.row = 0; - frame_data.col = 0; - frame_data.bit_depth = bit_depth; - aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.tune_cfg.vmaf_model_path, - update_frame, resized_y_width, resized_y_height, - bit_depth, scores); -#endif + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg); + unsigned int *sses = aom_malloc(sizeof(*sses) * (num_rows * num_cols)); + memset(sses, 0, sizeof(*sses) * (num_rows * num_cols)); // Loop through each 'block_size' block. 
for (int row = 0; row < num_rows; ++row) { @@ -820,15 +679,14 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { uint8_t *const blurred_buf = blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; - unsigned int sse; - cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, - blurred_buf, blurred.y_stride, &sse); + cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, + blurred_buf, blurred.y_stride, + &sses[index]); -#if CONFIG_USE_VMAF_RC uint8_t *const recon_buf = recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y; // Set recon buf - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, @@ -839,13 +697,11 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { resized_block_w, resized_block_h, 0.0); } - double vmaf; - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, - &resized_source, &recon, bit_depth, index, - &vmaf); + aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth, + index); // Restore recon buf - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { highbd_unsharp_rect( CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, @@ -856,13 +712,18 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { resized_source.y_stride, recon_buf, recon.y_stride, resized_block_w, resized_block_h, 0.0); } -#else - const double vmaf = scores[index]; -#endif + } + } + aom_flush_vmaf_context(vmaf_context); + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const double vmaf = aom_calc_vmaf_at_index( + vmaf_context, cpi->vmaf_info.vmaf_model, index); const double dvmaf = kBaselineVmaf 
- vmaf; const double mse = - (double)sse / (double)(resized_y_width * resized_y_height); + (double)sses[index] / (double)(resized_y_width * resized_y_height); double weight; const double eps = 0.01 / (num_rows * num_cols); if (dvmaf < eps || mse < eps) { @@ -879,11 +740,8 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { aom_free_frame_buffer(&resized_source); aom_free_frame_buffer(&blurred); -#if CONFIG_USE_VMAF_RC - aom_close_vmaf_context_rc(vmaf_context); -#else - aom_free(scores); -#endif + aom_close_vmaf_context(vmaf_context); + aom_free(sses); aom_clear_system_state(); } @@ -967,27 +825,32 @@ static double calc_vmaf_motion_score(const AV1_COMP *const cpi, const int y_height = cur->y_height; YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cur->subsampling_x; + const int ss_y = cur->subsampling_y; memset(&blurred_cur, 0, sizeof(blurred_cur)); memset(&blurred_last, 0, sizeof(blurred_last)); memset(&blurred_next, 0, sizeof(blurred_next)); - aom_alloc_frame_buffer( - &blurred_cur, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &blurred_last, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &blurred_next, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + 
cm->features.byte_alignment); gaussian_blur(bit_depth, cur, &blurred_cur); gaussian_blur(bit_depth, last, &blurred_last); if (next) gaussian_blur(bit_depth, next, &blurred_next); double motion1, motion2 = 65536.0; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH); const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); @@ -1026,9 +889,9 @@ static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi, YV12_BUFFER_CONFIG **last, YV12_BUFFER_CONFIG **next) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; const int src_index = - cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[gf_group->index]; + cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *last_entry = av1_lookahead_peek( cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage); struct lookahead_entry *next_entry = av1_lookahead_peek( @@ -1046,9 +909,9 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { return current_qindex; } aom_clear_system_state(); - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_ysse = get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth); const double last_frame_vmaf = @@ -1065,7 +928,7 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { } YV12_BUFFER_CONFIG *cur_buf = cpi->source; if (cm->show_frame == 0) { - const int src_index = gf_group->arf_src_offset[gf_group->index]; + const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *cur_entry = 
av1_lookahead_peek( cpi->ppi->lookahead, src_index, cpi->compressor_stage); cur_buf = &cur_entry->img; @@ -1084,7 +947,8 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { const double dsse = dvmaf * approx_sse / approx_dvmaf; const double beta = approx_sse / (dsse + approx_sse); - const int offset = av1_get_deltaq_offset(cpi, current_qindex, beta); + const int offset = + av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta); int qindex = current_qindex + offset; qindex = AOMMIN(qindex, MAXQ); @@ -1094,23 +958,23 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { return qindex; } -#if CONFIG_USE_VMAF_RC static AOM_INLINE double cal_approx_score( - AV1_COMP *const cpi, VmafContext *vmaf_context, int vmaf_cal_index, - double src_variance, double new_variance, double src_score, - YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon_sharpened) { + AV1_COMP *const cpi, double src_variance, double new_variance, + double src_score, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon_sharpened) { double score; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, src, - recon_sharpened, bit_depth, vmaf_cal_index, &score); + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth, + cal_vmaf_neg, &score); return src_variance / new_variance * (score - src_score); } static double find_best_frame_unsharp_amount_loop_neg( - AV1_COMP *const cpi, VmafContext *vmaf_context, double src_variance, - double base_score, YV12_BUFFER_CONFIG *const src, - YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, - YV12_BUFFER_CONFIG *const src_blurred, + AV1_COMP *const cpi, double src_variance, double base_score, + YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, + YV12_BUFFER_CONFIG *const ref, 
YV12_BUFFER_CONFIG *const src_blurred, YV12_BUFFER_CONFIG *const recon_blurred, YV12_BUFFER_CONFIG *const src_sharpened, YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, @@ -1120,7 +984,6 @@ static double find_best_frame_unsharp_amount_loop_neg( int loop_count = 0; double approx_score = best_score; double unsharp_amount = unsharp_amount_start; - int vmaf_cal_index = 3; do { best_score = approx_score; @@ -1130,9 +993,8 @@ static double find_best_frame_unsharp_amount_loop_neg( unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount); const double new_variance = residual_frame_average_variance(cpi, src_sharpened, ref, mvs); - approx_score = - cal_approx_score(cpi, vmaf_context, vmaf_cal_index++, src_variance, - new_variance, base_score, src, recon_sharpened); + approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score, + src, recon_sharpened); loop_count++; } while (approx_score > best_score && loop_count < max_loop_count); @@ -1143,11 +1005,11 @@ static double find_best_frame_unsharp_amount_loop_neg( } static double find_best_frame_unsharp_amount_neg( - AV1_COMP *const cpi, VmafContext *vmaf_context, - YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, - YV12_BUFFER_CONFIG *const ref, double base_score, - const double unsharp_amount_start, const double step_size, - const int max_loop_count, const double max_filter_amount) { + AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, + double base_score, const double unsharp_amount_start, + const double step_size, const int max_loop_count, + const double max_filter_amount) { FULLPEL_MV *mvs = NULL; const double src_variance = residual_frame_average_variance(cpi, src, ref, mvs); @@ -1156,22 +1018,28 @@ static double find_best_frame_unsharp_amount_neg( const int width = recon->y_width; const int height = recon->y_height; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = recon->subsampling_x; + const int 
ss_y = recon->subsampling_y; + YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened; memset(&recon_sharpened, 0, sizeof(recon_sharpened)); memset(&src_sharpened, 0, sizeof(src_sharpened)); memset(&recon_blurred, 0, sizeof(recon_blurred)); memset(&src_blurred, 0, sizeof(src_blurred)); + aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); aom_alloc_frame_buffer( - &recon_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &src_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &recon_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &src_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, + &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); gaussian_blur(bit_depth, recon, &recon_blurred); @@ -1181,32 +1049,28 @@ static double find_best_frame_unsharp_amount_neg( unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start); const double variance_start = residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); - const double score_start = - cal_approx_score(cpi, vmaf_context, 1, src_variance, variance_start, - base_score, src, &recon_sharpened); + const double score_start = cal_approx_score( + cpi, 
src_variance, variance_start, base_score, src, &recon_sharpened); const double unsharp_amount_next = unsharp_amount_start + step_size; unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next); unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next); const double variance_next = residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); - const double score_next = - cal_approx_score(cpi, vmaf_context, 2, src_variance, variance_next, - base_score, src, &recon_sharpened); + const double score_next = cal_approx_score(cpi, src_variance, variance_next, + base_score, src, &recon_sharpened); double unsharp_amount; if (score_next > score_start) { unsharp_amount = find_best_frame_unsharp_amount_loop_neg( - cpi, vmaf_context, src_variance, base_score, src, recon, ref, - &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs, - score_next, unsharp_amount_next, step_size, max_loop_count, - max_filter_amount); + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next, + unsharp_amount_next, step_size, max_loop_count, max_filter_amount); } else { unsharp_amount = find_best_frame_unsharp_amount_loop_neg( - cpi, vmaf_context, src_variance, base_score, src, recon, ref, - &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs, - score_start, unsharp_amount_start, -step_size, max_loop_count, - max_filter_amount); + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start, + unsharp_amount_start, -step_size, max_loop_count, max_filter_amount); } aom_free_frame_buffer(&recon_sharpened); @@ -1216,29 +1080,21 @@ static double find_best_frame_unsharp_amount_neg( aom_free(mvs); return unsharp_amount; } -#endif // CONFIG_USE_VMAF_RC void av1_update_vmaf_curve(AV1_COMP *cpi) { YV12_BUFFER_CONFIG *source = cpi->source; YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; const int 
bit_depth = cpi->td.mb.e_mbd.bd; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); -#if CONFIG_USE_VMAF_RC + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); double base_score; - VmafContext *vmaf_context; - aom_init_vmaf_context_rc( - &vmaf_context, cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN); - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, source, - recon, bit_depth, 0, &base_score); + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth, + cal_vmaf_neg, &base_score); cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score; -#else - aom_calc_vmaf(cpi->oxcf.tune_cfg.vmaf_model_path, source, recon, bit_depth, - &cpi->vmaf_info.last_frame_vmaf[layer_depth]); -#endif // CONFIG_USE_VMAF_RC - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(recon->flags & YV12_FLAG_HIGHBITDEPTH); cpi->vmaf_info.last_frame_ysse[layer_depth] = @@ -1248,7 +1104,6 @@ void av1_update_vmaf_curve(AV1_COMP *cpi) { (double)aom_get_y_sse(source, recon); } -#if CONFIG_USE_VMAF_RC if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { YV12_BUFFER_CONFIG *last, *next; get_neighbor_frames(cpi, &last, &next); @@ -1256,10 +1111,8 @@ void av1_update_vmaf_curve(AV1_COMP *cpi) { get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); const int max_loop_count = 5; cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = - find_best_frame_unsharp_amount_neg( - cpi, vmaf_context, source, recon, last, base_score, - best_unsharp_amount_start, 0.025, max_loop_count, 1.01); + find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score, + 
best_unsharp_amount_start, 0.025, + max_loop_count, 1.01); } - aom_close_vmaf_context_rc(vmaf_context); -#endif // CONFIG_USE_VMAF_RC } diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h index 01c3068bf0..4625fb9061 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h +++ b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h @@ -36,10 +36,8 @@ typedef struct { // Stores the origial qindex before scaling. int original_qindex; -#if CONFIG_USE_VMAF_RC // VMAF model used in VMAF caculations. VmafModel *vmaf_model; -#endif } TuneVMAFInfo; typedef struct AV1_COMP AV1_COMP; @@ -48,9 +46,7 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); -#ifdef CONFIG_USE_VMAF_RC void av1_vmaf_neg_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); -#endif void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi); diff --git a/third_party/libaom/source/libaom/av1/encoder/tx_search.c b/third_party/libaom/source/libaom/av1/encoder/tx_search.c index 30aac0a349..e65b70f788 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tx_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/tx_search.c @@ -618,7 +618,7 @@ static AOM_INLINE void get_energy_distribution_fine( assert(bw <= 32); assert(bh <= 32); assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (int i = 0; i < bh; ++i) @@ -643,43 +643,43 @@ static AOM_INLINE void get_energy_distribution_fine( const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); assert(block_size_high[bsize] == 4 * block_size_high[subsize]); - cpi->fn_ptr[subsize].vf(src, src_stride, 
dst, dst_stride, &esq[0]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[1]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[2]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[3]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[1]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[2]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[3]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[5]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[6]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[7]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[5]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[6]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[7]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[9]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[10]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[11]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + 
dst_stride, &esq[9]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[10]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[11]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[13]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[14]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[15]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[13]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[14]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[15]); } double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + @@ -769,13 +769,13 @@ static AOM_INLINE void get_2x2_normalized_sses_and_sads( if (sse_norm_arr) { unsigned int this_sse; - cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, - dst_stride, &this_sse); + cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; } if (sad_norm_arr) { - const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( + const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf( this_src, src_stride, this_dst, dst_stride); sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; } @@ -832,11 +832,11 @@ static AOM_INLINE void PrintTransformUnitStats( const uint8_t *const dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; unsigned int sse; - cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + 
cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); const double sse_norm = (double)sse / num_samples; const unsigned int sad = - cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); const double sad_norm = (double)sad / num_samples; fprintf(fout, " %g %g", sse_norm, sad_norm); @@ -905,8 +905,8 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { if (x->skip_chroma_rd && plane) continue; - cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, - &sse); + cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); total_sse += sse; } total_sse <<= 4; @@ -1030,7 +1030,7 @@ static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, const double sse_norm = (double)sse / num_samples; const unsigned int sad = - cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); const double sad_norm = (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); @@ -1183,7 +1183,7 @@ static unsigned pixel_dist_visible_only( unsigned sse; if (txb_rows == visible_rows && txb_cols == visible_cols) { - cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); return sse; } @@ -2024,9 +2024,15 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, assert(plane == 0); allowed_tx_mask = ext_tx_used_flag; int num_allowed = 0; - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - const int *tx_type_probs = - cpi->frame_probs.tx_type_probs[update_type][tx_size]; + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int *tx_type_probs; +#if CONFIG_FRAME_PARALLEL_ENCODE + tx_type_probs = + (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size]; +#else + 
tx_type_probs = (int *)cpi->frame_probs.tx_type_probs[update_type][tx_size]; +#endif int i; if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { @@ -2097,25 +2103,8 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, #if CONFIG_RD_DEBUG static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, - TX_SIZE tx_size, int blk_row, - int blk_col, int txb_coeff_cost) { - (void)blk_row; - (void)blk_col; - (void)tx_size; + int txb_coeff_cost) { rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; - - { - const int txb_h = tx_size_high_unit[tx_size]; - const int txb_w = tx_size_wide_unit[tx_size]; - int idx, idy; - for (idy = 0; idy < txb_h; ++idy) - for (idx = 0; idx < txb_w; ++idx) - rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0; - - rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost; - } - assert(blk_row < TXB_COEFF_COST_MAP_SIZE); - assert(blk_col < TXB_COEFF_COST_MAP_SIZE); } #endif @@ -2674,8 +2663,7 @@ static AOM_INLINE void try_tx_block_no_split( RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse)); if (pick_skip_txfm) { #if CONFIG_RD_DEBUG - update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col, - zero_blk_rate - rd_stats->rate); + update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate); #endif // CONFIG_RD_DEBUG rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; @@ -2720,11 +2708,12 @@ static AOM_INLINE void try_tx_block_split( x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1]; for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) { + const int offsetr = blk_row + r; + if (offsetr >= max_blocks_high) break; for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) { assert(blk_idx < 4); - const int offsetr = blk_row + r; const int offsetc = blk_col + c; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + if (offsetc >= max_blocks_wide) continue; RD_STATS this_rd_stats; int this_cost_valid = 1; @@ -3173,8 
+3162,7 @@ static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row, } #if CONFIG_RD_DEBUG - update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, - this_rd_stats.rate); + update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate); #endif // CONFIG_RD_DEBUG av1_set_txb_context(x, plane, block, tx_size, a, l); @@ -3452,15 +3440,18 @@ static AOM_INLINE void tx_block_yrd( const int txb_width = tx_size_wide_unit[sub_txs]; const int txb_height = tx_size_high_unit[sub_txs]; const int step = txb_height * txb_width; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); RD_STATS pn_rd_stats; int64_t this_rd = 0; assert(txb_width > 0 && txb_height > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += txb_height) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += txb_width) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += txb_height) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += txb_width) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; av1_init_rd_stats(&pn_rd_stats); tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize, diff --git a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c index 31b86abe64..884d0a9e8b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c +++ b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c @@ -327,16 +327,8 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, const LV_MAP_EOB_COST *txb_eob_costs = &coeff_costs->eob_costs[eob_multi_size][plane_type]; - const int rshift = - (sharpness + - (cpi->oxcf.q_cfg.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 - ? 
7 - mbmi->segment_id - : 2) + - (cpi->oxcf.q_cfg.aq_mode != VARIANCE_AQ && - cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL && - cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0 - ? (3 - x->sb_energy_level) - : 0)); + const int rshift = sharpness + 2; + const int64_t rdmult = (((int64_t)x->rdmult * (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) + diff --git a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h index e86caaa06e..70b322a2e1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h +++ b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h @@ -44,11 +44,11 @@ extern "C" { * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). * \param[out] rate_cost The entropy cost of coding the transform block * after adjustment of coefficients. - * \param[in] sharpness When sharpness == 1, the function will be less - * aggressive toward lowering the magnitude of coefficients. + * \param[in] sharpness When sharpness > 0, the function will be less + * aggressive towards lowering the magnitude of coefficients. * In this way, the transform block will contain more high-frequency - coefficients - * and therefore preserve the sharpness of the reconstructed block. + * coefficients and therefore will preserve the sharpness of the reconstructed + * block. 
*/ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, diff --git a/third_party/libaom/source/libaom/av1/encoder/var_based_part.c b/third_party/libaom/source/libaom/av1/encoder/var_based_part.c index a42be4553f..8907d0d0ba 100644 --- a/third_party/libaom/source/libaom/av1/encoder/var_based_part.c +++ b/third_party/libaom/source/libaom/av1/encoder/var_based_part.c @@ -341,7 +341,7 @@ static int64_t scale_part_thresh_content(int64_t threshold_base, int speed, static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q, int content_lowsumdiff, - int segment_id) { + int source_sad, int segment_id) { AV1_COMMON *const cm = &cpi->common; const int is_key_frame = frame_is_intra_only(cm); const int threshold_multiplier = is_key_frame ? 120 : 1; @@ -394,7 +394,6 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width, cm->height, cpi->svc.non_reference_frame); #endif - thresholds[0] = threshold_base >> 1; thresholds[1] = threshold_base; thresholds[3] = threshold_base << cpi->oxcf.speed; @@ -436,20 +435,45 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], thresholds[2] = (5 * threshold_base) >> 1; } if (cpi->sf.rt_sf.force_large_partition_blocks) { + double weight; + const int win = 20; + if (current_qindex < QINDEX_LARGE_BLOCK_THR - win) + weight = 1.0; + else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win) + weight = 0.0; + else + weight = + 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2 * win); + if (cm->width * cm->height > 640 * 480) { + for (int i = 0; i < 4; i++) { + thresholds[i] <<= 1; + } + } if (cm->width * cm->height <= 352 * 288) { thresholds[1] <<= 2; thresholds[2] <<= 5; thresholds[3] = INT32_MAX; - } else if (cm->width * cm->height > 640 * 480 && segment_id == 0) { + // Condition the increase of partition thresholds on the 
segment + // and the content. Avoid the increase for superblocks which have + // high source sad, unless the whole frame has very high motion + // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks + // have high source sad). + } else if (cm->width * cm->height > 640 * 480 && segment_id == 0 && + (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) { thresholds[0] = (3 * thresholds[0]) >> 1; thresholds[3] = INT32_MAX; - if (current_qindex >= QINDEX_LARGE_BLOCK_THR) { - thresholds[1] <<= 1; - thresholds[2] <<= 1; + if (current_qindex > QINDEX_LARGE_BLOCK_THR) { + thresholds[1] = (int)((1 - weight) * (thresholds[1] << 1) + + weight * thresholds[1]); + thresholds[2] = (int)((1 - weight) * (thresholds[2] << 1) + + weight * thresholds[2]); } - } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0) { - thresholds[1] <<= 2; - thresholds[2] <<= 5; + } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0 && + (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) { + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]); thresholds[3] = INT32_MAX; } } @@ -605,7 +629,7 @@ static AOM_INLINE void set_low_temp_var_flag( xd->mi[0]->mv[0].as_mv.col > -mv_thr && xd->mi[0]->mv[0].as_mv.row < mv_thr && xd->mi[0]->mv[0].as_mv.row > -mv_thr))) { - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); if (is_small_sb) set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd, &(vt->split[0]), thresholds, mi_col, mi_row); @@ -621,7 +645,8 @@ void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q, if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) { return; } else { - set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0); + set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, 
content_lowsumdiff, 0, + 0); // The threshold below is not changed locally. cpi->vbp_info.threshold_minmax = 15 + (q >> 3); } @@ -643,10 +668,17 @@ static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); if (bs != BLOCK_INVALID) - uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, - pd->dst.stride); - - x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2); + uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride); + + if (uv_sad > (y_sad >> 1)) + x->color_sensitivity_sb[i - 1] = 1; + else if (uv_sad < (y_sad >> 3)) + x->color_sensitivity_sb[i - 1] = 0; + // Borderline case: to be refined at coding block level in nonrd_pickmode, + // for coding block size < sb_size. + else + x->color_sensitivity_sb[i - 1] = 2; } } @@ -658,7 +690,7 @@ static void fill_variance_tree_leaves( AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int is_key_frame = frame_is_intra_only(cm); - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); const int num_64x64_blocks = is_small_sb ? 1 : 4; // TODO(kyslov) Bring back compute_minmax_variance with content type detection const int compute_minmax_variance = 0; @@ -772,7 +804,7 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int num_planes = av1_num_planes(cm); - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it // is!! @@ -783,13 +815,13 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, // For non-SVC GOLDEN is another temporal reference. 
Check if it should be // used as reference for partitioning. - if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) && + if (!cpi->ppi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) && cpi->sf.rt_sf.use_nonrd_pick_mode) { yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); if (yv12_g && yv12_g != yv12) { av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); - *y_sad_g = cpi->fn_ptr[bsize].sdf( + *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf( x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); } @@ -799,20 +831,20 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, get_ref_scale_factors(cm, LAST_FRAME), num_planes); mi->ref_frame[0] = LAST_FRAME; mi->ref_frame[1] = NONE_FRAME; - mi->bsize = cm->seq_params.sb_size; + mi->bsize = cm->seq_params->sb_size; mi->mv[0].as_int = 0; mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) { if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { const MV dummy_mv = { 0, 0 }; - *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, + *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &dummy_mv); } } if (*y_sad == UINT_MAX) { - *y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, - xd->plane[0].pre[0].stride); + *y_sad = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); } // Pick the ref frame for partitioning, use golden frame only if its @@ -834,7 +866,7 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, - cm->seq_params.sb_size, AOM_PLANE_Y, + cm->seq_params->sb_size, AOM_PLANE_Y, AOM_PLANE_Y); } @@ 
-869,12 +901,12 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, int is_key_frame = (frame_is_intra_only(cm) || - (cpi->use_svc && + (cpi->ppi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); - assert(cm->seq_params.sb_size == BLOCK_64X64 || - cm->seq_params.sb_size == BLOCK_128X128); - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); const int num_64x64_blocks = is_small_sb ? 1 : 4; unsigned int y_sad = UINT_MAX; @@ -900,10 +932,12 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, cyclic_refresh_segment_id_boosted(segment_id) && cpi->sf.rt_sf.use_nonrd_pick_mode) { int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); - set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff, 1); + set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff, + x->content_state_sb.source_sad, 1); } else { set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex, - x->content_state_sb.low_sumdiff, 0); + x->content_state_sb.low_sumdiff, + x->content_state_sb.source_sad, 0); } // For non keyframes, disable 4x4 average for low resolution when speed = 8 @@ -1025,7 +1059,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, if (!is_key_frame && (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) && max_var_32x32[m] > thresholds[1] >> 1 && - (noise_level >= kMedium || cpi->use_svc || + (noise_level >= kMedium || cpi->ppi->use_svc || cpi->sf.rt_sf.force_large_partition_blocks || !cpi->sf.rt_sf.use_nonrd_pick_mode)) { force_split[1 + m] = 1; diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c index 
b5477ec9ba..68509fa106 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c @@ -352,10 +352,16 @@ void av1_highbd_apply_temporal_filter_avx2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -393,6 +399,7 @@ void av1_highbd_apply_temporal_filter_avx2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c index bbb3771543..1bfdaf72e1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c @@ -227,10 +227,16 @@ void av1_highbd_apply_temporal_filter_sse2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -268,6 +274,7 @@ void av1_highbd_apply_temporal_filter_sse2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c index 72914e1781..8aa07641aa 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c @@ -238,10 +238,16 @@ void av1_apply_temporal_filter_avx2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -277,6 +283,7 @@ void av1_apply_temporal_filter_avx2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c index d70792c644..26c3926dca 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c @@ -215,10 +215,16 @@ void av1_apply_temporal_filter_sse2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -254,6 +260,7 @@ void av1_apply_temporal_filter_sse2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/common/args.c b/third_party/libaom/source/libaom/common/args.c index 64d6e03383..ed622943e3 100644 --- a/third_party/libaom/source/libaom/common/args.c +++ b/third_party/libaom/source/libaom/common/args.c @@ -92,7 +92,6 @@ int parse_cfg(const char *file, cfg_options_t *config) { GET_PARAMS(disable_intrabc); GET_PARAMS(disable_cfl); GET_PARAMS(disable_smooth_intra); - GET_PARAMS(disable_diagonal_intra); GET_PARAMS(disable_filter_intra); GET_PARAMS(disable_dual_filter); GET_PARAMS(disable_intra_angle_delta); diff --git a/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c b/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c index 3aea2cfdd6..da36d9fe13 100644 --- a/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c +++ b/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c @@ -271,7 +271,11 @@ int main(int argc, char **argv) { printf("Using %s\n", aom_codec_iface_name(encoder)); +#if CONFIG_REALTIME_ONLY + res = aom_codec_enc_config_default(encoder, &cfg, 1); +#else res = aom_codec_enc_config_default(encoder, &cfg, 0); +#endif if (res) die_codec(&ecodec, "Failed to get default codec config."); cfg.g_w = info.frame_width; @@ -334,6 +338,12 @@ int main(int argc, char **argv) { die_codec(&ecodec, "Failed to set encoder reference frame"); printf(" <SET_REF>"); +#if CONFIG_REALTIME_ONLY + // Set cpu speed in encoder. + if (aom_codec_control(&ecodec, AOME_SET_CPUUSED, 7)) + die_codec(&ecodec, "Failed to set cpu speed"); +#endif + // If set_reference in decoder is commented out, the enc/dec mismatch // would be seen. 
if (test_decode) { diff --git a/third_party/libaom/source/libaom/examples/set_maps.c b/third_party/libaom/source/libaom/examples/set_maps.c index 69b4bccbe6..5a84faa565 100644 --- a/third_party/libaom/source/libaom/examples/set_maps.c +++ b/third_party/libaom/source/libaom/examples/set_maps.c @@ -129,6 +129,14 @@ int main(int argc, char **argv) { const int fps = 2; // TODO(dkovalev) add command line argument const double bits_per_pixel_per_frame = 0.067; +#if CONFIG_REALTIME_ONLY + const int usage = 1; + const int speed = 7; +#else + const int usage = 0; + const int speed = 2; +#endif + exec_name = argv[0]; if (argc != 6) die("Invalid number of arguments"); @@ -157,7 +165,7 @@ int main(int argc, char **argv) { printf("Using %s\n", aom_codec_iface_name(encoder)); - res = aom_codec_enc_config_default(encoder, &cfg, 0); + res = aom_codec_enc_config_default(encoder, &cfg, usage); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; @@ -177,7 +185,7 @@ int main(int argc, char **argv) { if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); - if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2)) + if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) die_codec(&codec, "Failed to set cpu-used"); // Encode frames. 
diff --git a/third_party/libaom/source/libaom/examples/simple_encoder.c b/third_party/libaom/source/libaom/examples/simple_encoder.c index 682fe9842b..c026706555 100644 --- a/third_party/libaom/source/libaom/examples/simple_encoder.c +++ b/third_party/libaom/source/libaom/examples/simple_encoder.c @@ -163,6 +163,13 @@ int main(int argc, char **argv) { const char *infile_arg = NULL; const char *outfile_arg = NULL; const char *keyframe_interval_arg = NULL; +#if CONFIG_REALTIME_ONLY + const int usage = 1; + const int speed = 7; +#else + const int usage = 0; + const int speed = 2; +#endif exec_name = argv[0]; @@ -204,7 +211,7 @@ int main(int argc, char **argv) { printf("Using %s\n", aom_codec_iface_name(encoder)); - res = aom_codec_enc_config_default(encoder, &cfg, 0); + res = aom_codec_enc_config_default(encoder, &cfg, usage); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; @@ -223,7 +230,7 @@ int main(int argc, char **argv) { if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); - if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2)) + if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) die_codec(&codec, "Failed to set cpu-used"); // Encode frames. 
diff --git a/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c b/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c index 87e3aa95f1..44bed38318 100644 --- a/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c +++ b/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c @@ -24,6 +24,7 @@ #include "common/args.h" #include "common/tools_common.h" #include "common/video_writer.h" +#include "examples/encoder_util.h" #include "aom_ports/aom_timer.h" #define OPTION_BUFFER_SIZE 1024 @@ -286,6 +287,9 @@ static void parse_command_line(int argc, const char **argv_, if (app_input->speed > 9) { warn("Mapping speed %d to speed 9.\n", app_input->speed); } + if (app_input->speed <= 6) { + die("Encoder speed setting should be in [7, 9].\n"); + } } else if (arg_match(&arg, &aqmode_arg, argi)) { app_input->aq_mode = arg_parse_uint(&arg); } else if (arg_match(&arg, &threads_arg, argi)) { @@ -567,7 +571,7 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, layer_id->spatial_layer_id = spatial_layer_id; int lag_index = 0; int base_count = superframe_cnt >> 2; - // Set the referende map buffer idx for the 7 references: + // Set the reference map buffer idx for the 7 references: // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i; @@ -795,12 +799,10 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 3. - // Set LAST2 to slot 4 and Update slot 4. + // No update. 
for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 3; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; - ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; - ref_frame_config->refresh[4] = 1; } } else if ((superframe_cnt - 2) % 4 == 0) { // Middle temporal enhancement layer. @@ -837,13 +839,11 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, - // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + // GOLDEN to slot 3. No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; - ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; - ref_frame_config->refresh[4] = 1; } } if (layer_id->spatial_layer_id > 0 && !ksvc_mode) { @@ -998,6 +998,64 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, } } +#if CONFIG_AV1_DECODER +static void test_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, + const int frames_out, int *mismatch_seen) { + aom_image_t enc_img, dec_img; + + if (*mismatch_seen) return; + + /* Get the internal reference frame */ + AOM_CODEC_CONTROL_TYPECHECKED(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img); + AOM_CODEC_CONTROL_TYPECHECKED(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img); + +#if CONFIG_AV1_HIGHBITDEPTH + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, 
dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } +#endif + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_AV1_HIGHBITDEPTH + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + aom_find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + decoder->err = 1; + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = frames_out; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} +#endif // CONFIG_AV1_DECODER + int main(int argc, const char **argv) { AppInput app_input; AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; @@ -1017,6 +1075,17 @@ int main(int argc, const char **argv) { aom_svc_params_t svc_params; aom_svc_ref_frame_config_t ref_frame_config; +#if CONFIG_INTERNAL_STATS + FILE *stats_file = fopen("opsnr.stt", "a"); + if (stats_file == NULL) { + die("Cannot open opsnr.stt\n"); + } +#endif +#if CONFIG_AV1_DECODER + int mismatch_seen = 0; + aom_codec_ctx_t decoder; +#endif + struct RateControlMetrics rc; int64_t cx_time = 0; int64_t cx_time_sl[3]; // max number of spatial layers. 
@@ -1039,11 +1108,12 @@ int main(int argc, const char **argv) { app_input.input_ctx.framerate.denominator = 1; app_input.input_ctx.only_i420 = 1; app_input.input_ctx.bit_depth = 0; + app_input.speed = 7; exec_name = argv[0]; // start with default encoder configuration - aom_codec_err_t res = - aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 0); + aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, + AOM_USAGE_REALTIME); if (res) { die("Failed to get config: %s\n", aom_codec_err_to_string(res)); } @@ -1071,10 +1141,13 @@ int main(int argc, const char **argv) { unsigned int width = cfg.g_w; unsigned int height = cfg.g_h; - if (ts_number_layers != - mode_to_num_temporal_layers[app_input.layering_mode] || - ss_number_layers != mode_to_num_spatial_layers[app_input.layering_mode]) { - die("Number of layers doesn't match layering mode."); + if (app_input.layering_mode >= 0) { + if (ts_number_layers != + mode_to_num_temporal_layers[app_input.layering_mode] || + ss_number_layers != + mode_to_num_spatial_layers[app_input.layering_mode]) { + die("Number of layers doesn't match layering mode."); + } } // Y4M reader has its own allocation. @@ -1109,20 +1182,16 @@ int main(int argc, const char **argv) { svc_params.framerate_factor[2] = 1; } - framerate = cfg.g_timebase.den / cfg.g_timebase.num; - set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers); - if (app_input.input_ctx.file_type == FILE_TYPE_Y4M) { - if (app_input.input_ctx.width != cfg.g_w || - app_input.input_ctx.height != cfg.g_h) { - die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h); - } - if (app_input.input_ctx.framerate.numerator != cfg.g_timebase.den || - app_input.input_ctx.framerate.denominator != cfg.g_timebase.num) { - die("Incorrect framerate: numerator %d denominator %d", - cfg.g_timebase.num, cfg.g_timebase.den); - } + // Override these settings with the info from Y4M file. 
+ cfg.g_w = app_input.input_ctx.width; + cfg.g_h = app_input.input_ctx.height; + // g_timebase is the reciprocal of frame rate. + cfg.g_timebase.num = app_input.input_ctx.framerate.denominator; + cfg.g_timebase.den = app_input.input_ctx.framerate.numerator; } + framerate = cfg.g_timebase.den / cfg.g_timebase.num; + set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers); AvxVideoInfo info; info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); @@ -1162,6 +1231,12 @@ int main(int argc, const char **argv) { if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); +#if CONFIG_AV1_DECODER + if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0)) { + die("Failed to initialize decoder"); + } +#endif + aom_codec_control(&codec, AOME_SET_CPUUSED, app_input.speed); aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode ? 3 : 0); aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0); @@ -1172,6 +1247,7 @@ int main(int argc, const char **argv) { aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3); + aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1); aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, cfg.g_threads ? get_msb(cfg.g_threads) : 0); @@ -1196,8 +1272,8 @@ int main(int argc, const char **argv) { svc_params.scaling_factor_num[1] = 1; svc_params.scaling_factor_den[1] = 2; } - aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params); + // TODO(aomedia:3032): Configure KSVC in fixed mode. // This controls the maximum target size of the key frame. // For generating smaller key frames, use a smaller max_intra_size_pct @@ -1220,15 +1296,34 @@ int main(int argc, const char **argv) { const aom_codec_cx_pkt_t *pkt; int layer = 0; - // Set the reference/update flags, layer_id, and reference_map - // buffer index. 
- set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id, - &ref_frame_config, &use_svc_control, slx, is_key_frame, - (app_input.layering_mode == 10)); - aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); - if (use_svc_control) - aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, - &ref_frame_config); + // For flexible mode: + if (app_input.layering_mode >= 0) { + // Set the reference/update flags, layer_id, and reference_map + // buffer index. + set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id, + &ref_frame_config, &use_svc_control, slx, + is_key_frame, (app_input.layering_mode == 10)); + aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); + if (use_svc_control) + aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, + &ref_frame_config); + } else { + // Only up to 3 temporal layers supported in fixed mode. + // Only need to set spatial and temporal layer_id: reference + // prediction, refresh, and buffer_idx are set internally. + layer_id.spatial_layer_id = slx; + layer_id.temporal_layer_id = 0; + if (ts_number_layers == 2) { + layer_id.temporal_layer_id = (frame_cnt % 2) != 0; + } else if (ts_number_layers == 3) { + if (frame_cnt % 2 != 0) + layer_id.temporal_layer_id = 2; + else if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) + layer_id.temporal_layer_id = 1; + } + aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); + } + if (set_err_resil_frame) { // Set error_resilient per frame: off/0 for base layer and // on/1 for enhancement layer frames. @@ -1332,14 +1427,31 @@ int main(int argc, const char **argv) { sum_bitrate2 = 0.0; } } + +#if CONFIG_AV1_DECODER + if (aom_codec_decode(&decoder, pkt->data.frame.buf, + (unsigned int)pkt->data.frame.sz, NULL)) + die_codec(&decoder, "Failed to decode frame."); +#endif + break; default: break; } } +#if CONFIG_AV1_DECODER + // Don't look for mismatch on top spatial and top temporal layers as they + // are non reference frames. 
+ if ((ss_number_layers > 1 || ts_number_layers > 1) && + !(layer_id.temporal_layer_id > 0 && + layer_id.temporal_layer_id == (int)ts_number_layers - 1)) { + test_decode(&codec, &decoder, frame_cnt, &mismatch_seen); + } +#endif } // loop over spatial layers ++frame_cnt; pts += frame_duration; } + close_input_file(&(app_input.input_ctx)); printout_rate_control_summary(&rc, frame_cnt, ss_number_layers, ts_number_layers); @@ -1358,6 +1470,15 @@ int main(int argc, const char **argv) { if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); +#if CONFIG_INTERNAL_STATS + if (mismatch_seen) { + fprintf(stats_file, "First mismatch occurred in frame %d\n", mismatch_seen); + } else { + fprintf(stats_file, "No mismatch detected in recon buffers\n"); + } + fclose(stats_file); +#endif + // Try to rewrite the output file headers with the actual frame count. for (i = 0; i < ss_number_layers * ts_number_layers; ++i) aom_video_writer_close(outfile[i]); diff --git a/third_party/libaom/source/libaom/test/active_map_test.cc b/third_party/libaom/source/libaom/test/active_map_test.cc index 4e30f55f81..2bbc3b64fb 100644 --- a/third_party/libaom/source/libaom/test/active_map_test.cc +++ b/third_party/libaom/source/libaom/test/active_map_test.cc @@ -38,6 +38,9 @@ class ActiveMapTest ::libaom_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); } else if (video->frame() == 3) { aom_active_map_t map = aom_active_map_t(); /* clang-format off */ @@ -87,14 +90,6 @@ class ActiveMapTest TEST_P(ActiveMapTest, Test) { DoTest(); } -class ActiveMapTestLarge : public ActiveMapTest {}; - -TEST_P(ActiveMapTestLarge, Test) { DoTest(); } - -AV1_INSTANTIATE_TEST_SUITE(ActiveMapTestLarge, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Range(0, 5)); - 
AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest, ::testing::Values(::libaom_test::kRealTime), ::testing::Range(5, 9)); diff --git a/third_party/libaom/source/libaom/test/altref_test.cc b/third_party/libaom/source/libaom/test/altref_test.cc index 1334b4af57..002a206967 100644 --- a/third_party/libaom/source/libaom/test/altref_test.cc +++ b/third_party/libaom/source/libaom/test/altref_test.cc @@ -133,9 +133,7 @@ const gfIntervalParam gfTestParams[] = { { ::libaom_test::kTwoPassGood, 5, 10 }, { ::libaom_test::kTwoPassGood, 8, 16 }, { ::libaom_test::kTwoPassGood, 16, 32 }, - // disabled below test case because it causes failure - // TODO(anyone): enable below test case once issue is fixed. - // { ::libaom_test::kTwoPassGood, 20, 32 }, + { ::libaom_test::kTwoPassGood, 20, 32 }, }; // This class is used to test if the gf interval bounds configured by the user diff --git a/third_party/libaom/source/libaom/test/aom_image_test.cc b/third_party/libaom/source/libaom/test/aom_image_test.cc new file mode 100644 index 0000000000..7ff82d7273 --- /dev/null +++ b/third_party/libaom/source/libaom/test/aom_image_test.cc @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_image.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +TEST(AomImageTest, AomImgWrapInvalidAlign) { + const int kWidth = 128; + const int kHeight = 128; + unsigned char buf[kWidth * kHeight * 3]; + + aom_image_t img; + // Set img_data and img_data_owner to junk values. 
aom_img_wrap() should + // not read these values on failure. + img.img_data = (unsigned char *)""; + img.img_data_owner = 1; + + aom_img_fmt_t format = AOM_IMG_FMT_I444; + // 'align' must be a power of 2 but is not. This causes the aom_img_wrap() + // call to fail. The test verifies we do not read the junk values in 'img'. + unsigned int align = 31; + EXPECT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), nullptr); +} diff --git a/third_party/libaom/source/libaom/test/aq_segment_test.cc b/third_party/libaom/source/libaom/test/aq_segment_test.cc index 4e52b55dbe..b4a8b612bf 100644 --- a/third_party/libaom/source/libaom/test/aq_segment_test.cc +++ b/third_party/libaom/source/libaom/test/aq_segment_test.cc @@ -19,6 +19,13 @@ namespace { +const libaom_test::TestMode kTestModeParams[] = +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime }; +#else + { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood }; +#endif + class AqSegmentTest : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, int>, @@ -40,6 +47,11 @@ class AqSegmentTest encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_); encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + if (mode_ == ::libaom_test::kRealTime) { + encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); + } } } @@ -69,10 +81,7 @@ class AqSegmentTest // 3-cyclic_refresh_aq) encodes and decodes without a mismatch. TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); } -class AqSegmentTestLarge : public AqSegmentTest {}; - -TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); } - +#if !CONFIG_REALTIME_ONLY // Validate that this delta q mode // encodes and decodes without a mismatch. 
TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) { @@ -84,13 +93,18 @@ TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +#endif -AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest, - ::testing::Values(::libaom_test::kRealTime, - ::libaom_test::kOnePassGood), +AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest, ::testing::ValuesIn(kTestModeParams), ::testing::Range(5, 9), ::testing::Range(0, 4)); + +#if !CONFIG_REALTIME_ONLY +class AqSegmentTestLarge : public AqSegmentTest {}; + +TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); } + AV1_INSTANTIATE_TEST_SUITE(AqSegmentTestLarge, - ::testing::Values(::libaom_test::kRealTime, - ::libaom_test::kOnePassGood), + ::testing::Values(::libaom_test::kOnePassGood), ::testing::Range(3, 5), ::testing::Range(0, 4)); +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/arf_freq_test.cc b/third_party/libaom/source/libaom/test/arf_freq_test.cc index 0bf47e6ec4..d12f5ccee6 100644 --- a/third_party/libaom/source/libaom/test/arf_freq_test.cc +++ b/third_party/libaom/source/libaom/test/arf_freq_test.cc @@ -56,9 +56,13 @@ const TestVideoParam kTestVectors[] = { }; const TestEncodeParam kEncodeVectors[] = { - { ::libaom_test::kOnePassGood, 2 }, { ::libaom_test::kOnePassGood, 5 }, - { ::libaom_test::kTwoPassGood, 1 }, { ::libaom_test::kTwoPassGood, 2 }, - { ::libaom_test::kTwoPassGood, 5 }, { ::libaom_test::kRealTime, 5 }, +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime, 5 }, +#else + { ::libaom_test::kRealTime, 5 }, { ::libaom_test::kOnePassGood, 2 }, + { ::libaom_test::kOnePassGood, 5 }, { ::libaom_test::kTwoPassGood, 1 }, + { ::libaom_test::kTwoPassGood, 2 }, { ::libaom_test::kTwoPassGood, 5 }, +#endif }; const int kMinArfVectors[] = { diff --git a/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc b/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc index a1c5746637..65300140ba 100644 --- 
a/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc +++ b/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc @@ -293,8 +293,8 @@ class ConvolveScaleTestBase : public ::testing::Test { convolve_params_.do_average = do_average; } else { convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; - convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0]; - convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1]; + convolve_params_.fwd_offset = quant_dist_lookup_table[j][i]; + convolve_params_.bck_offset = quant_dist_lookup_table[j][1 - i]; convolve_params_.is_compound = is_compound; convolve_params_.do_average = do_average; } diff --git a/third_party/libaom/source/libaom/test/av1_convolve_test.cc b/third_party/libaom/source/libaom/test/av1_convolve_test.cc index 0c902808ad..4d61f02298 100644 --- a/third_party/libaom/source/libaom/test/av1_convolve_test.cc +++ b/third_party/libaom/source/libaom/test/av1_convolve_test.cc @@ -1172,8 +1172,8 @@ std::vector<CompoundParam> GetCompoundParams() { result.push_back(CompoundParam(false, 0, 0)); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 4; ++l) { - result.push_back(CompoundParam(true, quant_dist_lookup_table[k][l][0], - quant_dist_lookup_table[k][l][1])); + result.push_back(CompoundParam(true, quant_dist_lookup_table[l][k], + quant_dist_lookup_table[l][1 - k])); } } return result; diff --git a/third_party/libaom/source/libaom/test/av1_external_partition_test.cc b/third_party/libaom/source/libaom/test/av1_external_partition_test.cc new file mode 100644 index 0000000000..4fe61c7843 --- /dev/null +++ b/third_party/libaom/source/libaom/test/av1_external_partition_test.cc @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <fstream> +#include <new> +#include <sstream> +#include <string> + +#include "aom/aom_codec.h" +#include "aom/aom_external_partition.h" +#include "av1/common/blockd.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/y4m_video_source.h" +#include "test/util.h" + +#if CONFIG_AV1_ENCODER +#if !CONFIG_REALTIME_ONLY +namespace { + +constexpr int kFrameNum = 8; +constexpr int kVersion = 1; + +typedef struct TestData { + int version = kVersion; +} TestData; + +typedef struct ToyModel { + TestData *data; + aom_ext_part_config_t config; + aom_ext_part_funcs_t funcs; +} ToyModel; + +// Feature files written during encoding, as defined in partition_strategy.c. +std::string feature_file_names[] = { + "feature_before_partition_none", + "feature_before_partition_none_prune_rect", + "feature_after_partition_none_prune", + "feature_after_partition_none_terminate", + "feature_after_partition_split_terminate", + "feature_after_partition_split_prune_rect", + "feature_after_partition_rect", + "feature_after_partition_ab", +}; + +// Files written here in the test, where the feature data is received +// from the API. 
+std::string test_feature_file_names[] = { + "test_feature_before_partition_none", + "test_feature_before_partition_none_prune_rect", + "test_feature_after_partition_none_prune", + "test_feature_after_partition_none_terminate", + "test_feature_after_partition_split_terminate", + "test_feature_after_partition_split_prune_rect", + "test_feature_after_partition_rect", + "test_feature_after_partition_ab", +}; + +static void write_features_to_file(const float *features, + const int feature_size, const int id) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s", + test_feature_file_names[id].c_str()); + FILE *pfile = fopen(filename, "a"); + for (int i = 0; i < feature_size; ++i) { + fprintf(pfile, "%.6f", features[i]); + if (i < feature_size - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + +aom_ext_part_status_t ext_part_create_model( + void *priv, const aom_ext_part_config_t *part_config, + aom_ext_part_model_t *ext_part_model) { + TestData *received_data = reinterpret_cast<TestData *>(priv); + EXPECT_EQ(received_data->version, kVersion); + ToyModel *toy_model = new (std::nothrow) ToyModel; + EXPECT_NE(toy_model, nullptr); + toy_model->data = received_data; + *ext_part_model = toy_model; + EXPECT_EQ(part_config->superblock_size, BLOCK_64X64); + return AOM_EXT_PART_OK; +} + +aom_ext_part_status_t ext_part_create_model_test( + void *priv, const aom_ext_part_config_t *part_config, + aom_ext_part_model_t *ext_part_model) { + (void)priv; + (void)ext_part_model; + EXPECT_EQ(part_config->superblock_size, BLOCK_64X64); + return AOM_EXT_PART_TEST; +} + +aom_ext_part_status_t ext_part_send_features( + aom_ext_part_model_t ext_part_model, + const aom_partition_features_t *part_features) { + (void)ext_part_model; + (void)part_features; + return AOM_EXT_PART_OK; +} + +aom_ext_part_status_t ext_part_send_features_test( + aom_ext_part_model_t ext_part_model, + const aom_partition_features_t *part_features) { + (void)ext_part_model; + if 
(part_features->id == FEATURE_BEFORE_PART_NONE) { + write_features_to_file(part_features->before_part_none.f, SIZE_DIRECT_SPLIT, + 0); + } else if (part_features->id == FEATURE_BEFORE_PART_NONE_PART2) { + write_features_to_file(part_features->before_part_none.f_part2, + SIZE_PRUNE_PART, 1); + } else if (part_features->id == FEATURE_AFTER_PART_NONE) { + write_features_to_file(part_features->after_part_none.f, SIZE_PRUNE_NONE, + 2); + } else if (part_features->id == FEATURE_AFTER_PART_NONE_PART2) { + write_features_to_file(part_features->after_part_none.f_terminate, + SIZE_TERM_NONE, 3); + } else if (part_features->id == FEATURE_AFTER_PART_SPLIT) { + write_features_to_file(part_features->after_part_split.f_terminate, + SIZE_TERM_SPLIT, 4); + } else if (part_features->id == FEATURE_AFTER_PART_SPLIT_PART2) { + write_features_to_file(part_features->after_part_split.f_prune_rect, + SIZE_PRUNE_RECT, 5); + } else if (part_features->id == FEATURE_AFTER_PART_RECT) { + write_features_to_file(part_features->after_part_rect.f, SIZE_PRUNE_AB, 6); + } else if (part_features->id == FEATURE_AFTER_PART_AB) { + write_features_to_file(part_features->after_part_ab.f, SIZE_PRUNE_4_WAY, 7); + } + return AOM_EXT_PART_TEST; +} + +aom_ext_part_status_t ext_part_get_partition_decision( + aom_ext_part_model_t ext_part_model, + aom_partition_decision_t *ext_part_decision) { + (void)ext_part_model; + (void)ext_part_decision; + return AOM_EXT_PART_ERROR; +} + +aom_ext_part_status_t ext_part_send_partition_stats( + aom_ext_part_model_t ext_part_model, + const aom_partition_stats_t *ext_part_stats) { + (void)ext_part_model; + (void)ext_part_stats; + return AOM_EXT_PART_OK; +} + +aom_ext_part_status_t ext_part_delete_model( + aom_ext_part_model_t ext_part_model) { + ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model); + EXPECT_EQ(toy_model->data->version, kVersion); + delete toy_model; + return AOM_EXT_PART_OK; +} + +class ExternalPartitionTest + : public 
::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + ExternalPartitionTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {} + virtual ~ExternalPartitionTest() {} + + virtual void SetUp() { + InitializeConfig(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cfg_.rc_end_usage = AOM_VBR; + cfg_.g_threads = 1; + cfg_.g_lag_in_frames = 4; + cfg_.rc_target_bitrate = 400; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual bool DoDecode() const { return false; } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + void SetExternalPartition(bool use_external_partition) { + use_external_partition_ = use_external_partition; + } + + void SetTestSendFeatures(int test_send_features) { + test_send_features_ = test_send_features; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + aom_ext_part_funcs_t ext_part_funcs; + ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_); + if (use_external_partition_) { + ext_part_funcs.create_model = ext_part_create_model; + ext_part_funcs.send_features = ext_part_send_features; + } + if (test_send_features_ == 1) { + ext_part_funcs.create_model = ext_part_create_model; + ext_part_funcs.send_features = ext_part_send_features_test; + } else if (test_send_features_ == 0) { + ext_part_funcs.create_model = ext_part_create_model_test; + ext_part_funcs.send_features = ext_part_send_features; + } + ext_part_funcs.get_partition_decision = ext_part_get_partition_decision; + ext_part_funcs.send_partition_stats = ext_part_send_partition_stats; + 
ext_part_funcs.delete_model = ext_part_delete_model; + + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + if (use_external_partition_) { + encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs); + } + } + } + + private: + libaom_test::TestMode encoding_mode_; + int cpu_used_; + double psnr_; + unsigned int nframes_; + bool use_external_partition_ = false; + int test_send_features_ = -1; + TestData test_data_; +}; + +// Encode twice and expect the same psnr value. +// The first run is the baseline without external partition. +// The second run is to get partition decisions from the toy model we defined. +// Here, we let the partition decision return true for all stages. +// In this case, the external partition doesn't alter the original encoder +// behavior. So we expect the same encoding results. +TEST_P(ExternalPartitionTest, EncodeMatch) { + ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum); + SetExternalPartition(false); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr = GetAveragePsnr(); + + SetExternalPartition(true); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr2 = GetAveragePsnr(); + + EXPECT_DOUBLE_EQ(psnr, psnr2); +} + +// Encode twice to compare generated feature files. +// The first run let the encoder write partition features to file. +// The second run calls send partition features function to send features to +// the external model, and we write them to file. +// The generated files should match each other. +TEST_P(ExternalPartitionTest, SendFeatures) { + ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum); + SetExternalPartition(true); + SetTestSendFeatures(0); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + SetExternalPartition(true); + SetTestSendFeatures(1); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare feature files by reading them into strings. 
+ for (int i = 0; i < 8; ++i) { + std::ifstream base_file(feature_file_names[i]); + std::stringstream base_stream; + base_stream << base_file.rdbuf(); + std::string base_string = base_stream.str(); + + std::ifstream test_file(test_feature_file_names[i]); + std::stringstream test_stream; + test_stream << test_file.rdbuf(); + std::string test_string = test_stream.str(); + + EXPECT_STREQ(base_string.c_str(), test_string.c_str()); + } + + // Remove files. + std::string command("rm -f feature_* test_feature_*"); + system(command.c_str()); +} + +AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Values(4)); // cpu_used + +} // namespace +#endif // !CONFIG_REALTIME_ONLY +#endif // CONFIG_AV1_ENCODER diff --git a/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc b/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc index 0e7eb09f2a..d124330ff8 100644 --- a/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc +++ b/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc @@ -362,6 +362,78 @@ TEST_P(AV1FwdTxfm2dTest, match) { TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) { AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1)); } +TEST(AV1FwdTxfm2dTest, DCTScaleTest) { + BitDepthInfo bd_info; + bd_info.bit_depth = 8; + bd_info.use_highbitdepth_buf = 0; + DECLARE_ALIGNED(32, int16_t, src_diff[1024]); + DECLARE_ALIGNED(32, tran_low_t, coeff[1024]); + + const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 }; + const int stride_list[4] = { 4, 8, 16, 32 }; + const int ref_scale_list[4] = { 64, 64, 64, 16 }; + + for (int i = 0; i < 4; i++) { + TX_SIZE tx_size = tx_size_list[i]; + int stride = stride_list[i]; + int array_size = stride * stride; + + for (int i = 0; i < array_size; i++) { + src_diff[i] = 8; + coeff[i] = 0; + } + + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, stride, + coeff); + + double input_sse = 0; + double output_sse = 0; + for (int i = 0; i < 
array_size; i++) { + input_sse += pow(src_diff[i], 2); + output_sse += pow(coeff[i], 2); + } + + double scale = output_sse / input_sse; + + EXPECT_NEAR(scale, ref_scale_list[i], 5); + } +} +TEST(AV1FwdTxfm2dTest, HadamardScaleTest) { + BitDepthInfo bd_info; + bd_info.bit_depth = 8; + bd_info.use_highbitdepth_buf = 0; + DECLARE_ALIGNED(32, int16_t, src_diff[1024]); + DECLARE_ALIGNED(32, tran_low_t, coeff[1024]); + + const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 }; + const int stride_list[4] = { 4, 8, 16, 32 }; + const int ref_scale_list[4] = { 1, 64, 64, 16 }; + + for (int i = 0; i < 4; i++) { + TX_SIZE tx_size = tx_size_list[i]; + int stride = stride_list[i]; + int array_size = stride * stride; + + for (int i = 0; i < array_size; i++) { + src_diff[i] = 8; + coeff[i] = 0; + } + + av1_quick_txfm(/*use_hadamard=*/1, tx_size, bd_info, src_diff, stride, + coeff); + + double input_sse = 0; + double output_sse = 0; + for (int i = 0; i < array_size; i++) { + input_sse += pow(src_diff[i], 2); + output_sse += pow(coeff[i], 2); + } + + double scale = output_sse / input_sse; + + EXPECT_NEAR(scale, ref_scale_list[i], 5); + } +} using ::testing::Combine; using ::testing::Values; using ::testing::ValuesIn; @@ -580,8 +652,10 @@ using ::testing::ValuesIn; #if HAVE_SSE4_1 static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, - TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, - TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, + TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, +#if !CONFIG_REALTIME_ONLY + TX_4X16, TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, +#endif }; INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest, diff --git a/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc b/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc index a576c0ffed..165abc9483 100644 --- a/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc +++ 
b/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc @@ -210,6 +210,12 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvTxfm2d); void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_, int run_times, int bit_depth_, int gt_int16) { +#if CONFIG_REALTIME_ONLY + if (tx_size_ == TX_4X16 || tx_size_ == TX_16X4 || tx_size_ == TX_8X32 || + tx_size_ == TX_32X8 || tx_size_ == TX_16X64 || tx_size_ == TX_64X16) { + return; + } +#endif FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_]; TxfmParam txfm_param; const int BLK_WIDTH = 64; diff --git a/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc b/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc index 3d06d2d6c5..058b8ce443 100644 --- a/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc +++ b/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc @@ -29,10 +29,15 @@ class BaseKeyValAPI : public testing::Test { #if CONFIG_AV1_ENCODER aom_codec_iface_t *iface_cx = aom_codec_av1_cx(); aom_codec_enc_cfg_t enc_cfg; - +#if CONFIG_REALTIME_ONLY + const int usage = 1; +#else + const int usage = 0; +#endif + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface_cx, &enc_cfg, usage)); EXPECT_EQ(AOM_CODEC_OK, - aom_codec_enc_config_default(iface_cx, &enc_cfg, 0)); - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, 0)); + aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, usage)); #endif #if CONFIG_AV1_DECODER aom_codec_iface_t *iface_dx = aom_codec_av1_dx(); diff --git a/third_party/libaom/source/libaom/test/av1_quantize_test.cc b/third_party/libaom/source/libaom/test/av1_quantize_test.cc index f0882c7099..bfb684effd 100644 --- a/third_party/libaom/source/libaom/test/av1_quantize_test.cc +++ b/third_party/libaom/source/libaom/test/av1_quantize_test.cc @@ -19,6 +19,7 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "av1/common/scan.h" +#include 
"av1/encoder/av1_quantize.h" namespace { @@ -207,6 +208,32 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1QuantizeTest); TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); } TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); } +TEST(AV1QuantizeTest, QuantizeFpNoQmatrix) { + // Here we use a uniform quantizer as an example + const int16_t dequant_ptr[2] = { 78, 93 }; // quantize step + const int16_t round_ptr[2] = { 39, 46 }; // round ~= dequant / 2 + + // quant ~= 2^16 / dequant. This is a 16-bit fixed point representation of the + // inverse of quantize step. + const int16_t quant_ptr[2] = { 840, 704 }; + int log_scale = 0; + int coeff_count = 4; + const tran_low_t coeff_ptr[4] = { -449, 624, -14, 24 }; + const tran_low_t ref_qcoeff_ptr[4] = { -6, 7, 0, 0 }; + const tran_low_t ref_dqcoeff_ptr[4] = { -468, 651, 0, 0 }; + const int16_t scan[4] = { 0, 1, 2, 3 }; + tran_low_t qcoeff_ptr[4]; + tran_low_t dqcoeff_ptr[4]; + int eob = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr, + log_scale, scan, coeff_count, coeff_ptr, + qcoeff_ptr, dqcoeff_ptr); + EXPECT_EQ(eob, 2); + for (int i = 0; i < coeff_count; ++i) { + EXPECT_EQ(qcoeff_ptr[i], ref_qcoeff_ptr[i]); + EXPECT_EQ(dqcoeff_ptr[i], ref_dqcoeff_ptr[i]); + } +} + #if HAVE_SSE4_1 const QuantizeFuncParams qfps[4] = { QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c, diff --git a/third_party/libaom/source/libaom/test/block_test.cc b/third_party/libaom/source/libaom/test/block_test.cc index 9cf5b020ef..74deee3f54 100644 --- a/third_party/libaom/source/libaom/test/block_test.cc +++ b/third_party/libaom/source/libaom/test/block_test.cc @@ -191,9 +191,17 @@ TEST_P(SuperBlockSizeTestLarge, SuperBlockSizeTest) { << "Failed for SB size " << superblock_size_; } +const ::libaom_test::TestMode kTestModes[] = { +#if CONFIG_REALTIME_ONLY + ::libaom_test::kRealTime +#else + ::libaom_test::kRealTime, ::libaom_test::kOnePassGood, + ::libaom_test::kTwoPassGood +#endif +}; + 
AV1_INSTANTIATE_TEST_SUITE(SuperBlockSizeTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModes), ::testing::Values(AOM_SUPERBLOCK_SIZE_64X64, AOM_SUPERBLOCK_SIZE_128X128), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); diff --git a/third_party/libaom/source/libaom/test/coding_path_sync.cc b/third_party/libaom/source/libaom/test/coding_path_sync.cc index 4c613dc03b..0eaa9dad8d 100644 --- a/third_party/libaom/source/libaom/test/coding_path_sync.cc +++ b/third_party/libaom/source/libaom/test/coding_path_sync.cc @@ -31,7 +31,11 @@ class CompressedSource { aom_codec_iface_t *algo = aom_codec_av1_cx(); aom_codec_enc_cfg_t cfg; +#if CONFIG_REALTIME_ONLY + aom_codec_enc_config_default(algo, &cfg, 1); +#else aom_codec_enc_config_default(algo, &cfg, 0); +#endif // force the quantizer, to reduce the sensitivity on encoding choices. // e.g, we don't want this test to break when the rate control is modified. diff --git a/third_party/libaom/source/libaom/test/comp_avg_pred_test.h b/third_party/libaom/source/libaom/test/comp_avg_pred_test.h index 7f73312c4e..f2fee6d434 100644 --- a/third_party/libaom/source/libaom/test/comp_avg_pred_test.h +++ b/third_party/libaom/source/libaom/test/comp_avg_pred_test.h @@ -117,8 +117,8 @@ class AV1DISTWTDCOMPAVGTest for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -160,8 +160,8 @@ class AV1DISTWTDCOMPAVGTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - dist_wtd_comp_params.fwd_offset = 
quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; const int num_loops = 1000000000 / (in_w + in_h); aom_usec_timer timer; @@ -226,10 +226,9 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = - quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; dist_wtd_comp_params.bck_offset = - quant_dist_lookup_table[ii][jj][1]; + quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -282,8 +281,8 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; int sub_x_q3 = 0; int sub_y_q3 = 0; @@ -351,8 +350,8 @@ class AV1HighBDDISTWTDCOMPAVGTest for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -398,8 +397,8 @@ class AV1HighBDDISTWTDCOMPAVGTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - 
dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; const int num_loops = 1000000000 / (in_w + in_h); aom_usec_timer timer; @@ -466,10 +465,9 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = - quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; dist_wtd_comp_params.bck_offset = - quant_dist_lookup_table[ii][jj][1]; + quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -524,8 +522,8 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; int sub_x_q3 = 0; int sub_y_q3 = 0; const int num_loops = 1000000000 / (in_w + in_h); diff --git a/third_party/libaom/source/libaom/test/cpu_used_firstpass_test.cc b/third_party/libaom/source/libaom/test/cpu_used_firstpass_test.cc new file mode 100644 index 0000000000..c970c1977d --- /dev/null +++ b/third_party/libaom/source/libaom/test/cpu_used_firstpass_test.cc @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const double kPsnrDiffThreshold = 0.1; +const int kFirstPassCpuUsed[] = { 2, 4, 6 }; + +class CpuUsedFirstpassTest : public ::libaom_test::CodecTestWithParam<int>, + public ::libaom_test::EncoderTest { + protected: + CpuUsedFirstpassTest() + : EncoderTest(GET_PARAM(0)), second_pass_cpu_used_(GET_PARAM(1)) {} + virtual ~CpuUsedFirstpassTest() {} + + virtual void SetUp() { + InitializeConfig(::libaom_test::kTwoPassGood); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cfg_.rc_end_usage = AOM_VBR; + cfg_.rc_target_bitrate = 1000; + cfg_.g_lag_in_frames = 19; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int pass) { + psnr_ = 0.0; + nframes_ = 0; + + if (pass == 0) + cpu_used_ = first_pass_cpu_used_; + else + cpu_used_ = second_pass_cpu_used_; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrDiffThreshold() { return kPsnrDiffThreshold; } + + void DoTest() { + libaom_test::I420VideoSource 
video("niklas_640_480_30.yuv", 640, 480, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 30); + const int size = sizeof(kFirstPassCpuUsed) / sizeof(kFirstPassCpuUsed[0]); + double ref_psnr; + double psnr_diff; + + first_pass_cpu_used_ = second_pass_cpu_used_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // same preset case ref_psnr + ref_psnr = GetAveragePsnr(); + + for (int i = 0; i < size; i++) { + first_pass_cpu_used_ = kFirstPassCpuUsed[i]; + if (first_pass_cpu_used_ == second_pass_cpu_used_) continue; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + psnr_diff = abs(ref_psnr - GetAveragePsnr()); + EXPECT_LT(psnr_diff, GetPsnrDiffThreshold()) + << "first pass cpu used = " << first_pass_cpu_used_ + << ", second pass cpu used = " << second_pass_cpu_used_; + } + } + + int cpu_used_; + int first_pass_cpu_used_; + int second_pass_cpu_used_; + unsigned int nframes_; + double psnr_; +}; + +TEST_P(CpuUsedFirstpassTest, FirstPassTest) { DoTest(); } + +class CpuUsedFirstpassTestLarge : public CpuUsedFirstpassTest {}; + +TEST_P(CpuUsedFirstpassTestLarge, FirstPassTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(CpuUsedFirstpassTestLarge, + ::testing::Values(2)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(CpuUsedFirstpassTest, + ::testing::Values(4, 6)); // cpu_used +} // namespace diff --git a/third_party/libaom/source/libaom/test/datarate_test.cc b/third_party/libaom/source/libaom/test/datarate_test.cc index 2ff074fe8c..71f8b0f37b 100644 --- a/third_party/libaom/source/libaom/test/datarate_test.cc +++ b/third_party/libaom/source/libaom/test/datarate_test.cc @@ -57,7 +57,9 @@ class DatarateTestLarge ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.4) + // FIXME(jingning): Lower this test threshold after vbr mode can render + // sufficiently accurate bit rate. 
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.45) << " The datarate for the file is greater than target by too much!"; } diff --git a/third_party/libaom/source/libaom/test/datarate_test.h b/third_party/libaom/source/libaom/test/datarate_test.h index 0396034874..1b0d515efa 100644 --- a/third_party/libaom/source/libaom/test/datarate_test.h +++ b/third_party/libaom/source/libaom/test/datarate_test.h @@ -63,6 +63,7 @@ class DatarateTest : public ::libaom_test::EncoderTest { encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2); + encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2); } } diff --git a/third_party/libaom/source/libaom/test/encode_api_test.cc b/third_party/libaom/source/libaom/test/encode_api_test.cc index eb918460ae..70b0612ced 100644 --- a/third_party/libaom/source/libaom/test/encode_api_test.cc +++ b/third_party/libaom/source/libaom/test/encode_api_test.cc @@ -20,6 +20,12 @@ namespace { +#if CONFIG_REALTIME_ONLY +const int kUsage = 1; +#else +const int kUsage = 0; +#endif + TEST(EncodeAPI, InvalidParams) { uint8_t buf[1] = { 0 }; aom_image_t img; @@ -45,7 +51,7 @@ TEST(EncodeAPI, InvalidParams) { EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, NULL, 0)); EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_config_default(iface, &cfg, 3)); - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL)); @@ -63,13 +69,14 @@ TEST(EncodeAPI, InvalidControlId) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_ctx_t enc; aom_codec_enc_cfg_t cfg; - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); EXPECT_EQ(AOM_CODEC_OK, 
aom_codec_enc_init(&enc, iface, &cfg, 0)); EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&enc, -1, 0)); EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&enc, 0, 0)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#if !CONFIG_REALTIME_ONLY TEST(EncodeAPI, AllIntraMode) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_ctx_t enc; @@ -93,5 +100,6 @@ TEST(EncodeAPI, AllIntraMode) { cfg.kf_max_dist = 1; EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0)); } +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc b/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc index 6f52fd58ef..ad493e5ce0 100644 --- a/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc +++ b/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc @@ -19,11 +19,17 @@ #include "aom/aomcx.h" #include "aom/aom_encoder.h" +#include "config/aom_config.h" namespace { // Dummy buffer of zero samples. constexpr unsigned char kBuffer[256 * 512 + 2 * 128 * 256] = { 0 }; +#if CONFIG_REALTIME_ONLY +const int kUsage = 1; +#else +const int kUsage = 0; +#endif TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) { // The image has only one tile and the tile is two AV1 superblocks wide. @@ -37,7 +43,7 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_enc_cfg_t cfg; - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); cfg.g_threads = 2; cfg.g_w = kWidth; cfg.g_h = kHeight; @@ -49,6 +55,7 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#if !CONFIG_REALTIME_ONLY TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) { // The image has only one tile and the tile is two AV1 superblocks wide. 
// For speed 0, superblock size is 128x128 (see av1_select_sb_size()). @@ -72,6 +79,7 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#endif TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) { // The image has only one tile and the tile is one AV1 superblock tall. @@ -85,7 +93,7 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_enc_cfg_t cfg; - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); cfg.g_threads = 2; cfg.g_w = kWidth; cfg.g_h = kHeight; @@ -97,6 +105,7 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#if !CONFIG_REALTIME_ONLY TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) { // The image has only one tile and the tile is one AV1 superblock tall. // For speed 0, superblock size is 128x128 (see av1_select_sb_size()). 
@@ -120,5 +129,5 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } - +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/encode_test_driver.cc b/third_party/libaom/source/libaom/test/encode_test_driver.cc index 058e08e5d7..4a8801f06c 100644 --- a/third_party/libaom/source/libaom/test/encode_test_driver.cc +++ b/third_party/libaom/source/libaom/test/encode_test_driver.cc @@ -226,18 +226,18 @@ void EncoderTest::RunLoop(VideoSource *video) { encoder->EncodeFrame(video, frame_flags_); CxDataIterator iter = encoder->GetCxData(); + bool has_cxdata = false; #if CONFIG_AV1_DECODER - bool has_cxdata = false; bool has_dxdata = false; #endif while (const aom_codec_cx_pkt_t *pkt = iter.Next()) { pkt = MutateEncoderOutputHook(pkt); again = true; switch (pkt->kind) { - case AOM_CODEC_CX_FRAME_PKT: -#if CONFIG_AV1_DECODER + case AOM_CODEC_CX_FRAME_PKT: // has_cxdata = true; +#if CONFIG_AV1_DECODER if (decoder.get() != NULL && DoDecode()) { aom_codec_err_t res_dec; if (DoDecodeInvisible()) { @@ -267,21 +267,27 @@ void EncoderTest::RunLoop(VideoSource *video) { default: break; } } -#if CONFIG_AV1_DECODER - if (has_dxdata && has_cxdata) { + if (has_cxdata) { const aom_image_t *img_enc = encoder->GetPreviewFrame(); - DxDataIterator dec_iter = decoder->GetDxData(); - const aom_image_t *img_dec = dec_iter.Next(); - if (img_enc && img_dec) { - const bool res = - compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL); - if (!res) { // Mismatch - MismatchHook(img_enc, img_dec); + if (img_enc) { + CalculateFrameLevelSSIM(video->img(), img_enc, cfg_.g_bit_depth, + cfg_.g_input_bit_depth); + } +#if CONFIG_AV1_DECODER + if (has_dxdata) { + DxDataIterator dec_iter = decoder->GetDxData(); + const aom_image_t *img_dec = dec_iter.Next(); + if (img_enc && img_dec) { + const bool res = + compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, 
NULL); + if (!res) { // Mismatch + MismatchHook(img_enc, img_dec); + } } + if (img_dec) DecompressedFrameHook(*img_dec, video->pts()); } - if (img_dec) DecompressedFrameHook(*img_dec, video->pts()); - } #endif + } if (!Continue()) break; } // Loop over spatial layers } diff --git a/third_party/libaom/source/libaom/test/encode_test_driver.h b/third_party/libaom/source/libaom/test/encode_test_driver.h index 5da3ac5d0b..468a41bef3 100644 --- a/third_party/libaom/source/libaom/test/encode_test_driver.h +++ b/third_party/libaom/source/libaom/test/encode_test_driver.h @@ -134,6 +134,11 @@ class Encoder { ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); } + void Control(int ctrl_id, struct aom_ext_part_funcs *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + #if CONFIG_AV1_ENCODER void Control(int ctrl_id, aom_active_map_t *arg) { const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); @@ -216,6 +221,12 @@ class EncoderTest { // Hook to be called on every first pass stats packet. virtual void StatsPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {} + // Calculates SSIM at frame level. + virtual void CalculateFrameLevelSSIM(const aom_image_t * /*img_src*/, + const aom_image_t * /*img_enc*/, + aom_bit_depth_t /*bit_depth*/, + unsigned int /*input_bit_depth*/) {} + // Hook to determine whether the encode loop should continue. virtual bool Continue() const { return !(::testing::Test::HasFatalFailure() || abort_); diff --git a/third_party/libaom/source/libaom/test/encodemb_test.cc b/third_party/libaom/source/libaom/test/encodemb_test.cc new file mode 100644 index 0000000000..4c725c7dea --- /dev/null +++ b/third_party/libaom/source/libaom/test/encodemb_test.cc @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> +#include <vector> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/encodemb.h" +#include "av1/common/scan.h" + +namespace { + +// Reorders 'qcoeff_lexico', which is in lexicographic order (row by row), into +// scan order (zigzag) in 'qcoeff_scan'. +void ToScanOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_lexico, + tran_low_t *qcoeff_scan) { + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + for (int i = 0; i < max_eob; ++i) { + qcoeff_scan[i] = qcoeff_lexico[scan_order->scan[i]]; + } +} + +// Reorders 'qcoeff_scan', which is in scan order (zigzag), into lexicographic +// order (row by row) in 'qcoeff_lexico'. +void ToLexicoOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_scan, + tran_low_t *qcoeff_lexico) { + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + for (int i = 0; i < max_eob; ++i) { + qcoeff_lexico[scan_order->scan[i]] = qcoeff_scan[i]; + } +} + +// Runs coefficient dropout on 'qcoeff_scan'. +void Dropout(TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before, + int dropout_num_after, tran_low_t *qcoeff_scan) { + tran_low_t qcoeff[MAX_TX_SQUARE]; + // qcoeff_scan is assumed to be in scan order, since tests are easier to + // understand this way, but av1_dropout_qcoeff expects coeffs in lexico order + // so we convert to lexico then back to scan afterwards. 
+ ToLexicoOrder(tx_size, tx_type, qcoeff_scan, qcoeff); + + const int max_eob = av1_get_max_eob(tx_size); + const int kDequantFactor = 10; + tran_low_t dqcoeff[MAX_TX_SQUARE]; + for (int i = 0; i < max_eob; ++i) { + dqcoeff[i] = qcoeff[i] * kDequantFactor; + } + + uint16_t eob = max_eob; + while (eob > 0 && qcoeff_scan[eob - 1] == 0) --eob; + + MACROBLOCK mb; + const int kPlane = 0; + const int kBlock = 0; + memset(&mb, 0, sizeof(mb)); + uint16_t eobs[] = { eob }; + mb.plane[kPlane].eobs = eobs; + mb.plane[kPlane].qcoeff = qcoeff; + mb.plane[kPlane].dqcoeff = dqcoeff; + uint8_t txb_entropy_ctx[1]; + mb.plane[kPlane].txb_entropy_ctx = txb_entropy_ctx; + + av1_dropout_qcoeff_num(&mb, kPlane, kBlock, tx_size, tx_type, + dropout_num_before, dropout_num_after); + + ToScanOrder(tx_size, tx_type, qcoeff, qcoeff_scan); + + // Check updated eob value is valid. + uint16_t new_eob = max_eob; + while (new_eob > 0 && qcoeff_scan[new_eob - 1] == 0) --new_eob; + EXPECT_EQ(new_eob, mb.plane[kPlane].eobs[0]); + + // Check qqcoeff is still valid. + for (int i = 0; i < max_eob; ++i) { + EXPECT_EQ(qcoeff[i] * kDequantFactor, dqcoeff[i]); + } +} + +void ExpectArrayEq(tran_low_t *actual, std::vector<tran_low_t> expected) { + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(expected[i], actual[i]) << "Arrays differ at index " << i; + } +} + +static constexpr TX_TYPE kTxType = DCT_DCT; + +TEST(DropoutTest, KeepsLargeCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Large isolated coeffs should be preserved. 
+ tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 0, 0, 42, 0, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, -30, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 42, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, -30, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, RemovesSmallIsolatedCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small isolated coeffs should be removed. + tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, -2, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, KeepsSmallCoeffsAmongLargeOnes) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small coeffs that are not isolated (not enough zeros before/after should be + // kept). 
+ tran_low_t qcoeff_scan[] = { + 1, 0, 0, 0, -5, 0, 0, -1, // should be kept + 0, 0, 0, 10, 0, 0, 2, 0, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, -2, 0, 0, 0, 0, 0, 0 // should be removed + }; // should be removed + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 1, 0, 0, 0, -5, 0, 0, -1, // + 0, 0, 0, 10, 0, 0, 2, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, KeepsSmallCoeffsCloseToStartOrEnd) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small coeffs that are too close to the beginning or end of the block + // should also be kept (not enough zeroes before/after). + tran_low_t qcoeff_scan[] = { 0, 0, -1, 0, 0, 0, 0, 0, // should be kept + 0, 0, 0, 10, 0, 0, 0, 0, // should be kept + 0, 0, 0, 2, 0, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, -1, 0 }; // should be kept + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, -1, 0, 0, 0, 0, 0, // + 0, 0, 0, 10, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, -1, 0 }); +} + +TEST(DropoutTest, RemovesSmallClusterOfCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small clusters (<= kDropoutContinuityMax) of small coeffs should be + // removed. 
+ tran_low_t qcoeff_scan_two[] = { + 0, 0, 0, 0, 1, 0, 0, -1, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 1, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, + qcoeff_scan_two); + ExpectArrayEq(qcoeff_scan_two, { 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, KeepsLargeClusterOfCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Large clusters (> kDropoutContinuityMax) of small coeffs should be kept. + tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0, 1, -1, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, -2, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 1, 0, 1, -1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, NumBeforeLargerThanNumAfter) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 2; + // The second coeff (-2) doesn't seem to meet the dropout_num_before + // criteria. But since the first coeff (1) will be dropped, it will meet + // the criteria and should be dropped too. + tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0, 0, 0, // should be removed + -2, 0, 0, 0, 0, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +// More complex test combining other test cases. 
+TEST(DropoutTest, ComplexTest) { + const TX_SIZE tx_size = TX_8X8; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 2; + tran_low_t qcoeff_scan[] = { 1, 12, 0, 0, 0, 0, 1, 0, // + 0, 0, 0, -12, 0, 0, 0, 1, // + 0, 0, -2, 0, 1, 0, 0, 1, // + 0, 0, 0, 0, 5, 0, -1, 0, // + 0, 0, 0, 1, 0, 0, 0, -1, // + 0, 0, 0, 0, 2, 0, 0, 0, // + 0, 1, 0, 0, 0, 5, 0, 0, // + 0, 0, 1, 1, 0, 0, 0, -2 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 1, 12, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, -12, 0, 0, 0, 1, // + 0, 0, -2, 0, 1, 0, 0, 1, // + 0, 0, 0, 0, 5, 0, -1, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 5, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, -2 }); +} + +} // namespace diff --git a/third_party/libaom/source/libaom/test/end_to_end_psnr_test.cc b/third_party/libaom/source/libaom/test/end_to_end_psnr_test.cc new file mode 100644 index 0000000000..5574c1a909 --- /dev/null +++ b/third_party/libaom/source/libaom/test/end_to_end_psnr_test.cc @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <memory> +#include <ostream> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +namespace { + +const unsigned int kWidth = 160; +const unsigned int kHeight = 90; +const unsigned int kFramerate = 50; +const unsigned int kFrames = 10; +const int kBitrate = 500; +const unsigned int kCqLevel = 18; +// List of psnr thresholds for speed settings 0-8 and 4 encoding modes +const double kPsnrThreshold[][4] = { + { 35.7, 44.4, 39.5, 41.9 }, { 35.7, 44.4, 39.5, 41.9 }, + { 35.7, 44.4, 39.4, 41.9 }, { 35.7, 44.4, 39.1, 41.8 }, + { 35.6, 44.4, 39.1, 41.8 }, { 35.0, 44.3, 38.7, 41.8 }, + { 35.0, 44.3, 38.7, 41.3 }, { 35.0, 44.3, 38.7, 40.8 }, + { 35.0, 44.3, 38.7, 40.8 } +}; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << " }"; +} + +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 }, + { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 }, +#if CONFIG_AV1_HIGHBITDEPTH + { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 }, + { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 }, + { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 }, + { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 }, + { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 
2 }, + { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 }, +#endif +}; + +// Encoding modes tested +const libaom_test::TestMode kEncodingModeVectors[] = { + ::libaom_test::kTwoPassGood, + ::libaom_test::kOnePassGood, + ::libaom_test::kRealTime, +}; + +// Speed settings tested +const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 }; + +int is_extension_y4m(const char *filename) { + const char *dot = strrchr(filename, '.'); + if (!dot || dot == filename) + return 0; + else + return !strcmp(dot, ".y4m"); +} + +class EndToEndTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, + TestVideoParam, int>, + public ::libaom_test::EncoderTest { + protected: + EndToEndTest() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)), + cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0), + encoding_mode_(GET_PARAM(1)) {} + + virtual ~EndToEndTest() {} + + virtual void SetUp() { + InitializeConfig(encoding_mode_); + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { + cfg_.g_lag_in_frames = 5; + } else if (encoding_mode_ == ::libaom_test::kRealTime) { + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 4); + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + // Test screen coding tools at cpu_used = 1 && encoding mode is two-pass. 
+ if (cpu_used_ == 1 && encoding_mode_ == ::libaom_test::kTwoPassGood) + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN); + else + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT); + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } else if (encoding_mode_ == ::libaom_test::kAllIntra) { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { + return kPsnrThreshold[cpu_used_][encoding_mode_]; + } + + void DoTest() { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr<libaom_test::VideoSource> video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libaom_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, + kFramerate, 1, 0, kFrames)); + } + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()) + << "cpu used = " << cpu_used_ << ", encoding mode = " << encoding_mode_; + } + + TestVideoParam test_video_param_; + int cpu_used_; + + private: + double psnr_; + unsigned int nframes_; + libaom_test::TestMode encoding_mode_; +}; + +class EndToEndTestLarge : public EndToEndTest {}; + +class EndToEndAllIntraTestLarge : public EndToEndTest {}; + +class 
EndToEndAllIntraTest : public EndToEndTest {}; + +TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(EndToEndAllIntraTestLarge, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(EndToEndAllIntraTest, EndtoEndPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(EndToEndTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kCpuUsedVectors)); + +AV1_INSTANTIATE_TEST_SUITE(EndToEndTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Values(kTestVectors[2]), // 444 + ::testing::Values(3)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::ValuesIn(kTestVectors), + ::testing::Values(2, 4, 6, 8)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(kTestVectors[0]), // 420 + ::testing::Values(6)); // cpu_used +} // namespace diff --git a/third_party/libaom/source/libaom/test/end_to_end_ssim_test.cc b/third_party/libaom/source/libaom/test/end_to_end_ssim_test.cc new file mode 100644 index 0000000000..1e638d7b45 --- /dev/null +++ b/third_party/libaom/source/libaom/test/end_to_end_ssim_test.cc @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_ports/mem.h" +#include "aom_dsp/ssim.h" +#include "av1/common/blockd.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +const unsigned int kFrames = 10; +const unsigned int kCqLevel = 18; +// List of ssim thresholds for speed settings 0-8 with all intra encoding mode. +const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3, + 83.0, 82.3, 81.1, 81.1 }; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << " }"; +} + +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 }, + { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 }, +#if CONFIG_AV1_HIGHBITDEPTH + { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 }, + { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 }, + { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 }, + { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 }, + { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 }, + { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 }, +#endif +}; + +// This class is used to check adherence to given ssim value. 
+class EndToEndSSIMTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, + TestVideoParam, int>, + public ::libaom_test::EncoderTest { + protected: + EndToEndSSIMTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0), + ssim_(0.0) {} + + ~EndToEndSSIMTest() override {} + + void SetUp() override { InitializeConfig(encoding_mode_); } + + void BeginPassHook(unsigned int) override { + nframes_ = 0; + ssim_ = 0.0; + } + + void CalculateFrameLevelSSIM(const aom_image_t *img_src, + const aom_image_t *img_enc, + aom_bit_depth_t bit_depth, + unsigned int input_bit_depth) override { + double frame_ssim; + double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 }; + int crop_widths[PLANE_TYPES]; + int crop_heights[PLANE_TYPES]; + crop_widths[PLANE_TYPE_Y] = img_src->d_w; + crop_heights[PLANE_TYPE_Y] = img_src->d_h; + // Width of UV planes calculated based on chroma_shift values. + crop_widths[PLANE_TYPE_UV] = + img_src->x_chroma_shift == 1 ? (img_src->w + 1) >> 1 : img_src->w; + crop_heights[PLANE_TYPE_UV] = + img_src->y_chroma_shift == 1 ? (img_src->h + 1) >> 1 : img_src->h; + nframes_++; + +#if CONFIG_AV1_HIGHBITDEPTH + uint8_t is_hbd = bit_depth > AOM_BITS_8; + if (is_hbd) { + // HBD ssim calculation. + uint8_t shift = bit_depth - input_bit_depth; + for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) { + const int is_uv = i > AOM_PLANE_Y; + plane_ssim[i] = aom_highbd_ssim2( + CONVERT_TO_BYTEPTR(img_src->planes[i]), + CONVERT_TO_BYTEPTR(img_enc->planes[i]), + img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd, + crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift); + } + frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 + + .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]); + // Accumulate to find sequence level ssim value. 
+ ssim_ += frame_ssim; + return; + } +#else + (void)bit_depth; + (void)input_bit_depth; +#endif // CONFIG_AV1_HIGHBITDEPTH + + // LBD ssim calculation. + for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) { + const int is_uv = i > AOM_PLANE_Y; + plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i], + img_src->stride[is_uv], img_enc->stride[is_uv], + crop_widths[is_uv], crop_heights[is_uv]); + } + frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 + + .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]); + // Accumulate to find sequence level ssim value. + ssim_ += frame_ssim; + } + + void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 4); + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM); + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + } + + double GetAverageSsim() const { + if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0); + return 0.0; + } + + double GetSsimThreshold() { return kSsimThreshold[cpu_used_]; } + + void DoTest() { + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr<libaom_test::VideoSource> video( + new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + ASSERT_TRUE(video.get() != NULL); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double ssim = GetAverageSsim(); + EXPECT_GT(ssim, GetSsimThreshold()) + << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_; + } + + private: + const libaom_test::TestMode encoding_mode_; + const TestVideoParam test_video_param_; + const int cpu_used_; + unsigned int nframes_; + double ssim_; +}; + +class EndToEndSSIMTestLarge : 
public EndToEndSSIMTest {}; + +TEST_P(EndToEndSSIMTestLarge, EndtoEndSSIMTest) { DoTest(); } + +TEST_P(EndToEndSSIMTest, EndtoEndSSIMTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::ValuesIn(kTestVectors), + ::testing::Values(2, 4, 6, 8)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(kTestVectors[0]), // 420 + ::testing::Values(6)); // cpu_used +} // namespace diff --git a/third_party/libaom/source/libaom/test/error_resilience_test.cc b/third_party/libaom/source/libaom/test/error_resilience_test.cc index 31906a47d0..3999c9146d 100644 --- a/third_party/libaom/source/libaom/test/error_resilience_test.cc +++ b/third_party/libaom/source/libaom/test/error_resilience_test.cc @@ -358,6 +358,10 @@ TEST_P(ErrorResilienceTestLarge, OnVersusOff) { // if we lose (i.e., drop before decoding) a set of droppable // frames (i.e., frames that don't update any reference buffers). 
TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) { + if (GET_PARAM(1) == ::libaom_test::kOnePassGood && GET_PARAM(2) == 1) { + fprintf(stderr, "Skipping test case #1 because of bug aomedia:3002\n"); + return; + } SetupEncoder(500, 10); libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, cfg_.g_timebase.den, cfg_.g_timebase.num, diff --git a/third_party/libaom/source/libaom/test/ethread_test.cc b/third_party/libaom/source/libaom/test/ethread_test.cc index 5bf8762052..78811b65cf 100644 --- a/third_party/libaom/source/libaom/test/ethread_test.cc +++ b/third_party/libaom/source/libaom/test/ethread_test.cc @@ -21,6 +21,9 @@ #include "av1/encoder/firstpass.h" namespace { +const unsigned int kCqLevel = 18; + +#if !CONFIG_REALTIME_ONLY const size_t kFirstPassStatsSz = sizeof(FIRSTPASS_STATS); class AVxFirstPassEncoderThreadTest : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int, @@ -196,6 +199,7 @@ TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) { // Comparison 4 (between threads=4 and threads=8). 
compare_fp_stats_md5(&firstpass_stats); } +#endif // !CONFIG_REALTIME_ONLY class AVxEncoderThreadTest : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int, @@ -227,11 +231,12 @@ class AVxEncoderThreadTest virtual void SetUp() { InitializeConfig(encoding_mode_); - if (encoding_mode_ != ::libaom_test::kRealTime) { + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { cfg_.g_lag_in_frames = 6; cfg_.rc_2pass_vbr_minsection_pct = 5; cfg_.rc_2pass_vbr_maxsection_pct = 2000; - } else { + } else if (encoding_mode_ == ::libaom_test::kRealTime) { cfg_.g_error_resilient = 1; } cfg_.rc_max_quantizer = 56; @@ -248,18 +253,22 @@ class AVxEncoderThreadTest SetTileSize(encoder); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_ROW_MT, row_mt_); - if (encoding_mode_ != ::libaom_test::kRealTime) { + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); encoder->Control(AOME_SET_ARNR_MAXFRAMES, 5); encoder->Control(AOME_SET_ARNR_STRENGTH, 5); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0); encoder->Control(AV1E_SET_MAX_GF_INTERVAL, 4); - } else { + } else if (encoding_mode_ == ::libaom_test::kRealTime) { encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); encoder->Control(AV1E_SET_AQ_MODE, 3); encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 3); + encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 3); + } else { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); } encoder_initialized_ = true; } @@ -423,31 +432,60 @@ class AVxEncoderThreadTest std::vector<std::string> md5_dec_; }; -TEST_P(AVxEncoderThreadTest, EncoderResultTest) { +class AVxEncoderThreadRTTest : public AVxEncoderThreadTest {}; + +TEST_P(AVxEncoderThreadRTTest, EncoderResultTest) { cfg_.large_scale_tile = 0; 
decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); } -class AVxEncoderThreadRTTest : public AVxEncoderThreadTest {}; +// For real time mode, test speed 6, 7, 8, 9. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Values(6, 7, 8, 9), + ::testing::Values(0, 2), ::testing::Values(0, 2), + ::testing::Values(0, 1)); -TEST_P(AVxEncoderThreadRTTest, EncoderResultTest) { +#if !CONFIG_REALTIME_ONLY + +// The AVxEncoderThreadTestLarge takes up ~14% of total run-time of the +// Valgrind long tests. Exclude it; the smaller tests are still run. +#if !AOM_VALGRIND_BUILD +class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {}; + +TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { cfg_.large_scale_tile = 0; decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); } -class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {}; +// Test cpu_used 0, 1, 3 and 5. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge, + ::testing::Values(::libaom_test::kTwoPassGood, + ::libaom_test::kOnePassGood), + ::testing::Values(0, 1, 3, 5), + ::testing::Values(1, 6), ::testing::Values(1, 6), + ::testing::Values(0, 1)); +#endif // !AOM_VALGRIND_BUILD -TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { +TEST_P(AVxEncoderThreadTest, EncoderResultTest) { cfg_.large_scale_tile = 0; decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); } -class AVxEncoderThreadRTTestLarge : public AVxEncoderThreadTest {}; +class AVxEncoderThreadAllIntraTest : public AVxEncoderThreadTest {}; -TEST_P(AVxEncoderThreadRTTestLarge, EncoderResultTest) { +TEST_P(AVxEncoderThreadAllIntraTest, EncoderResultTest) { + cfg_.large_scale_tile = 0; + decoder_->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +class AVxEncoderThreadAllIntraTestLarge : public AVxEncoderThreadTest {}; + +TEST_P(AVxEncoderThreadAllIntraTestLarge, EncoderResultTest) { cfg_.large_scale_tile = 0; decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); @@ -466,26 +504,20 @@ 
AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTest, ::testing::Values(2), ::testing::Values(0, 2), ::testing::Values(0, 2), ::testing::Values(0, 1)); -// Test cpu_used 7, 8, 9 here. -AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Values(7, 8, 9), ::testing::Values(0, 2), +// For all intra mode, test speed 0, 2, 4, 6, 8. +// Only test cpu_used 6 here. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(6), ::testing::Values(0, 2), ::testing::Values(0, 2), ::testing::Values(0, 1)); -// Test cpu_used 0, 1, 3 and 5. -AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge, - ::testing::Values(::libaom_test::kTwoPassGood, - ::libaom_test::kOnePassGood), - ::testing::Values(0, 1, 3, 5), - ::testing::Values(1, 6), ::testing::Values(1, 6), - ::testing::Values(0, 1)); - -// Test cpu_used 0, 2, 4 and 6. -AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTestLarge, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Values(0, 2, 4, 6), +// Test cpu_used 0, 2, 4 and 8. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(0, 2, 4, 8), ::testing::Values(1, 6), ::testing::Values(1, 6), ::testing::Values(0, 1)); +#endif // !CONFIG_REALTIME_ONLY class AVxEncoderThreadLSTest : public AVxEncoderThreadTest { virtual void SetTileSize(libaom_test::Encoder *encoder) { @@ -512,6 +544,10 @@ TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) { DoTest(); } +// AVxEncoderThreadLSTestLarge takes up about 2% of total run-time of +// the Valgrind long tests. Since we already run AVxEncoderThreadLSTest, +// skip this one for Valgrind. 
+#if !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {}; TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) { @@ -526,4 +562,5 @@ AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadLSTestLarge, ::libaom_test::kOnePassGood), ::testing::Values(1, 3), ::testing::Values(0, 6), ::testing::Values(0, 6), ::testing::Values(1)); +#endif // !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD } // namespace diff --git a/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc b/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc index 5006b5b6cf..b060ee3913 100644 --- a/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc +++ b/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc @@ -199,6 +199,7 @@ int do_not_release_aom_frame_buffer(void *user_priv, #endif // CONFIG_WEBM_IO +#if !CONFIG_REALTIME_ONLY // Class for testing passing in external frame buffers to libaom. class ExternalFrameBufferMD5Test : public ::libaom_test::DecoderTest, @@ -298,6 +299,7 @@ class ExternalFrameBufferMD5Test int num_buffers_; ExternalFrameBufferList fb_list_; }; +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_WEBM_IO const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv"; @@ -395,6 +397,7 @@ class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { }; #endif // CONFIG_WEBM_IO +#if !CONFIG_REALTIME_ONLY // This test runs through the set of test vectors, and decodes them. // Libaom will call into the application to allocate a frame buffer when // needed. The md5 checksums are computed for each frame in the video file. @@ -438,6 +441,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) { // Decode frame, and check the md5 matching. 
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_WEBM_IO TEST_F(ExternalFrameBufferTest, MinFrameBuffers) { @@ -447,7 +451,11 @@ TEST_F(ExternalFrameBufferTest, MinFrameBuffers) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, EightJitterBuffers) { @@ -459,7 +467,11 @@ TEST_F(ExternalFrameBufferTest, EightJitterBuffers) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { @@ -470,10 +482,14 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); // Only run this on long clips. Decoding a very short clip will return // AOM_CODEC_OK even with only 2 buffers. 
ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, NoRelease) { @@ -481,8 +497,12 @@ TEST_F(ExternalFrameBufferTest, NoRelease) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, do_not_release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, NullRealloc) { @@ -515,11 +535,15 @@ TEST_F(ExternalFrameBufferTest, NullReleaseFunction) { } TEST_F(ExternalFrameBufferTest, SetAfterDecode) { +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame()); +#else const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); ASSERT_EQ(AOM_CODEC_ERROR, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#endif } TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { @@ -527,14 +551,20 @@ TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +#endif CheckFrameBufferRelease(); } #endif // CONFIG_WEBM_IO +#if !CONFIG_REALTIME_ONLY AV1_INSTANTIATE_TEST_SUITE( ExternalFrameBufferMD5Test, ::testing::ValuesIn(libaom_test::kAV1TestVectors, libaom_test::kAV1TestVectors + libaom_test::kNumAV1TestVectors)); +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/film_grain_table_test.cc b/third_party/libaom/source/libaom/test/film_grain_table_test.cc index 524d67d7bc..31fb908ffa 100644 --- a/third_party/libaom/source/libaom/test/film_grain_table_test.cc +++ 
b/third_party/libaom/source/libaom/test/film_grain_table_test.cc @@ -101,6 +101,20 @@ TEST(FilmGrainTableTest, AddAndLookupSingleSegment) { aom_film_grain_table_free(&table); } +TEST(FilmGrainTableTest, AddSingleSegmentRemoveBiggerSegment) { + aom_film_grain_table_t table; + aom_film_grain_t grain; + + memset(&table, 0, sizeof(table)); + + aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0); + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 1100, true, &grain)); + + EXPECT_EQ(0, table.head); + EXPECT_EQ(0, table.tail); + aom_film_grain_table_free(&table); +} + TEST(FilmGrainTableTest, SplitSingleSegment) { aom_film_grain_table_t table; aom_film_grain_t grain; diff --git a/third_party/libaom/source/libaom/test/frame_size_tests.cc b/third_party/libaom/source/libaom/test/frame_size_tests.cc index 38b6a63c3d..2365a20c24 100644 --- a/third_party/libaom/source/libaom/test/frame_size_tests.cc +++ b/third_party/libaom/source/libaom/test/frame_size_tests.cc @@ -73,6 +73,7 @@ TEST_F(AV1FrameSizeTests, OneByOneVideo) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +#if !CONFIG_REALTIME_ONLY typedef struct { unsigned int width; unsigned int height; @@ -129,5 +130,6 @@ TEST_P(AV1LosslessFrameSizeTests, LosslessEncode) { AV1_INSTANTIATE_TEST_SUITE(AV1LosslessFrameSizeTests, ::testing::ValuesIn(FrameSizeTestParams), testing::Values(::libaom_test::kAllIntra)); +#endif // !CONFIG_REALTIME_ONLY } // namespace diff --git a/third_party/libaom/source/libaom/test/hbd_metrics_test.cc b/third_party/libaom/source/libaom/test/hbd_metrics_test.cc index 8044b516c1..39c2b4c101 100644 --- a/third_party/libaom/source/libaom/test/hbd_metrics_test.cc +++ b/third_party/libaom/source/libaom/test/hbd_metrics_test.cc @@ -88,7 +88,7 @@ double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source, double compute_aomssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest) { double ssim, weight; - aom_calc_ssim(source, dest, &weight, &ssim); + 
aom_lowbd_calc_ssim(source, dest, &weight, &ssim); return 100 * pow(ssim / weight, 8.0); } diff --git a/third_party/libaom/source/libaom/test/horz_superres_test.cc b/third_party/libaom/source/libaom/test/horz_superres_test.cc index 9733344111..2f0f3fdb6a 100644 --- a/third_party/libaom/source/libaom/test/horz_superres_test.cc +++ b/third_party/libaom/source/libaom/test/horz_superres_test.cc @@ -52,7 +52,7 @@ std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { } const TestVideoParam kTestVideoVectors[] = { - { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5, + { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.4, 45.0 }, #if CONFIG_AV1_HIGHBITDEPTH { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 27.0, diff --git a/third_party/libaom/source/libaom/test/intrabc_test.cc b/third_party/libaom/source/libaom/test/intrabc_test.cc index b57eb6fab5..2c60596ab8 100644 --- a/third_party/libaom/source/libaom/test/intrabc_test.cc +++ b/third_party/libaom/source/libaom/test/intrabc_test.cc @@ -153,8 +153,10 @@ TEST(IntrabcTest, DvValidation) { xd.plane[2].subsampling_x = 1; xd.plane[2].subsampling_y = 1; + SequenceHeader seq_params = {}; AV1_COMMON cm; memset(&cm, 0, sizeof(cm)); + cm.seq_params = &seq_params; for (const DvTestCase &dv_case : kDvCases) { const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset; diff --git a/third_party/libaom/source/libaom/test/invalid_file_test.cc b/third_party/libaom/source/libaom/test/invalid_file_test.cc index 77839fafcd..6ac8d1ac32 100644 --- a/third_party/libaom/source/libaom/test/invalid_file_test.cc +++ b/third_party/libaom/source/libaom/test/invalid_file_test.cc @@ -151,6 +151,7 @@ const DecodeParam kAV1InvalidFileTests[] = { { 1, "invalid-oss-fuzz-10779.ivf", NULL }, { 1, "invalid-oss-fuzz-11477.ivf", NULL }, { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" }, + { 1, "invalid-oss-fuzz-33030.ivf", NULL }, #endif }; diff 
--git a/third_party/libaom/source/libaom/test/kf_test.cc b/third_party/libaom/source/libaom/test/kf_test.cc index cc2cc89c2b..2d228f2fef 100644 --- a/third_party/libaom/source/libaom/test/kf_test.cc +++ b/third_party/libaom/source/libaom/test/kf_test.cc @@ -100,10 +100,36 @@ class KeyFrameIntervalTestLarge aom_rc_mode end_usage_check_; }; +// Because valgrind builds take a very long time to run, use a lower +// resolution video for valgrind runs. +const char *TestFileName() { +#if AOM_VALGRIND_BUILD + return "hantro_collage_w176h144.yuv"; +#else + return "hantro_collage_w352h288.yuv"; +#endif // AOM_VALGRIND_BUILD +} + +int TestFileWidth() { +#if AOM_VALGRIND_BUILD + return 176; +#else + return 352; +#endif // AOM_VALGRIND_BUILD +} + +int TestFileHeight() { +#if AOM_VALGRIND_BUILD + return 144; +#else + return 288; +#endif // AOM_VALGRIND_BUILD +} + TEST_P(KeyFrameIntervalTestLarge, KeyFrameIntervalTest) { - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - cfg_.g_timebase.den, cfg_.g_timebase.num, - 0, 75); + libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(), + TestFileHeight(), cfg_.g_timebase.den, + cfg_.g_timebase.num, 0, 75); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_interval_violated_, false) << kf_dist_param_; } @@ -187,9 +213,9 @@ TEST_P(ForcedKeyTestLarge, Frame1IsKey) { frame_num_ = 0; cfg_.g_lag_in_frames = lag_values[i]; is_kf_placement_violated_ = false; - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - timebase.den, timebase.num, 0, - fwd_kf_enabled_ ? 60 : 30); + libaom_test::I420VideoSource video( + TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den, + timebase.num, 0, fwd_kf_enabled_ ? 
60 : 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_placement_violated_, false) << "Frame #" << frame_num_ << " isn't a keyframe!"; @@ -207,9 +233,9 @@ TEST_P(ForcedKeyTestLarge, ForcedFrameIsKey) { forced_kf_frame_num_ = lag_values[i] - 1; cfg_.g_lag_in_frames = lag_values[i]; is_kf_placement_violated_ = false; - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - timebase.den, timebase.num, 0, - fwd_kf_enabled_ ? 60 : 30); + libaom_test::I420VideoSource video( + TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den, + timebase.num, 0, fwd_kf_enabled_ ? 60 : 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_placement_violated_, false) << "Frame #" << frame_num_ << " isn't a keyframe!"; @@ -237,9 +263,9 @@ TEST_P(ForcedKeyTestLarge, ForcedFrameIsKeyCornerCases) { forced_kf_frame_num_ = (int)cfg_.kf_max_dist + kf_offsets[i]; forced_kf_frame_num_ = forced_kf_frame_num_ > 0 ? forced_kf_frame_num_ : 1; is_kf_placement_violated_ = false; - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - timebase.den, timebase.num, 0, - fwd_kf_enabled_ ? 60 : 30); + libaom_test::I420VideoSource video( + TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den, + timebase.num, 0, fwd_kf_enabled_ ? 
60 : 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_placement_violated_, false) << "Frame #" << frame_num_ << " isn't a keyframe!"; diff --git a/third_party/libaom/source/libaom/test/lossless_test.cc b/third_party/libaom/source/libaom/test/lossless_test.cc index 92ab299ea9..c14bc06e5e 100644 --- a/third_party/libaom/source/libaom/test/lossless_test.cc +++ b/third_party/libaom/source/libaom/test/lossless_test.cc @@ -24,13 +24,14 @@ namespace { const int kMaxPsnr = 100; class LosslessTestLarge - : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, - aom_rc_mode>, + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, + aom_rc_mode, int>, public ::libaom_test::EncoderTest { protected: LosslessTestLarge() : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), - encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)) {} + encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)), + cpu_used_(GET_PARAM(3)) {} virtual ~LosslessTestLarge() {} @@ -47,6 +48,7 @@ class LosslessTestLarge if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) { encoder->Control(AV1E_SET_LOSSLESS, 1); } + encoder->Control(AOME_SET_CPUUSED, cpu_used_); } } @@ -79,6 +81,7 @@ class LosslessTestLarge unsigned int nframes_; libaom_test::TestMode encoding_mode_; aom_rc_mode rc_end_usage_; + int cpu_used_; int base_qindex_; }; @@ -136,8 +139,33 @@ TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) { EXPECT_GE(psnr_lossless, kMaxPsnr); } +class LosslessAllIntraTestLarge : public LosslessTestLarge {}; + +TEST_P(LosslessAllIntraTestLarge, TestLossLessEncodingCtrl) { + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + // Intentionally set Q > 0, to make sure control can be used to activate + // lossless + cfg_.rc_min_quantizer = 10; + cfg_.rc_max_quantizer = 20; + + init_flags_ = AOM_CODEC_USE_PSNR; + + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 
5); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} + AV1_INSTANTIATE_TEST_SUITE(LosslessTestLarge, ::testing::Values(::libaom_test::kOnePassGood, ::libaom_test::kTwoPassGood), - ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); + ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ), + ::testing::Values(0)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(LosslessAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(AOM_Q), + ::testing::Values(6, 9)); // cpu_used } // namespace diff --git a/third_party/libaom/source/libaom/test/metadata_test.cc b/third_party/libaom/source/libaom/test/metadata_test.cc index fd3d5c4932..b7b7f14f42 100644 --- a/third_party/libaom/source/libaom/test/metadata_test.cc +++ b/third_party/libaom/source/libaom/test/metadata_test.cc @@ -34,7 +34,7 @@ const size_t kMetadataPayloadSizeCll = 4; const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02, 0x03 }; -#if CONFIG_AV1_ENCODER +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY const size_t kMetadataObuSizeT35 = 28; const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = { @@ -193,7 +193,7 @@ TEST_P(MetadataEncodeTest, TestMetadataEncoding) { AV1_INSTANTIATE_TEST_SUITE(MetadataEncodeTest, ::testing::Values(::libaom_test::kOnePassGood)); -#endif // CONFIG_AV1_ENCODER +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY } // namespace TEST(MetadataTest, MetadataAllocation) { diff --git a/third_party/libaom/source/libaom/test/monochrome_test.cc b/third_party/libaom/source/libaom/test/monochrome_test.cc index 6395c22caf..a71cc9b3df 100644 --- a/third_party/libaom/source/libaom/test/monochrome_test.cc +++ b/third_party/libaom/source/libaom/test/monochrome_test.cc @@ -20,16 +20,45 @@ namespace { +const unsigned int kCqLevel = 18; +const double kMaxPsnr = 100.0; + +// kPsnrThreshold represents the psnr threshold used to validate the quality of +// the first frame. 
The indices, 0 and 1 correspond to non-allintra and allintra +// encoding modes. +const double kPsnrThreshold[2] = { 29.0, 41.5 }; + +// kPsnrFluctuation represents the maximum allowed psnr fluctuation w.r.t first +// frame. The indices, 0 and 1 correspond to non-allintra and allintra encoding +// modes. +const double kPsnrFluctuation[2] = { 2.5, 0.3 }; + class MonochromeTest - : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>, + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, + int>, public ::libaom_test::EncoderTest { protected: - MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {} + MonochromeTest() + : EncoderTest(GET_PARAM(0)), lossless_(GET_PARAM(2)), + frame0_psnr_y_(0.0) {} virtual ~MonochromeTest() {} virtual void SetUp() { InitializeConfig(GET_PARAM(1)); } + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, GET_PARAM(3)); + if (mode_ == ::libaom_test::kAllIntra) { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + if (lossless_) { + encoder->Control(AV1E_SET_LOSSLESS, 1); + } + } + } + virtual void DecompressedFrameHook(const aom_image_t &img, aom_codec_pts_t pts) { (void)pts; @@ -68,15 +97,23 @@ class MonochromeTest } virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + // Check average PSNR value is >= 100 db in case of lossless encoding. + if (lossless_) { + EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr); + return; + } + const bool is_allintra = (mode_ == ::libaom_test::kAllIntra); // Check that the initial Y PSNR value is 'high enough', and check that // subsequent Y PSNR values are 'close' to this initial value. - if (frame0_psnr_y_ == 0.) 
{ + if (frame0_psnr_y_ == 0.0) { frame0_psnr_y_ = pkt->data.psnr.psnr[1]; - EXPECT_GT(frame0_psnr_y_, 29.); + EXPECT_GT(frame0_psnr_y_, kPsnrThreshold[is_allintra]); } - EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5); + EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, + kPsnrFluctuation[is_allintra]); } + int lossless_; std::vector<int> chroma_value_list_; double frame0_psnr_y_; }; @@ -87,9 +124,6 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) { init_flags_ = AOM_CODEC_USE_PSNR; - cfg_.g_w = 352; - cfg_.g_h = 288; - cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; cfg_.rc_buf_sz = 1000; @@ -98,13 +132,10 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) { cfg_.rc_undershoot_pct = 50; cfg_.rc_overshoot_pct = 50; cfg_.rc_end_usage = AOM_CBR; - cfg_.kf_mode = AOM_KF_AUTO; cfg_.g_lag_in_frames = 1; cfg_.kf_min_dist = cfg_.kf_max_dist = 3000; // Enable dropped frames. cfg_.rc_dropframe_thresh = 1; - // Disable error_resilience mode. - cfg_.g_error_resilient = 0; // Run at low bitrate. cfg_.rc_target_bitrate = 40; // Set monochrome encoding flag @@ -121,8 +152,33 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) { } } +class MonochromeAllIntraTest : public MonochromeTest {}; + +TEST_P(MonochromeAllIntraTest, TestMonochromeEncoding) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 5); + init_flags_ = AOM_CODEC_USE_PSNR; + // Set monochrome encoding flag + cfg_.monochrome = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check that the chroma planes are equal across all frames + std::vector<int>::const_iterator iter = chroma_value_list_.begin(); + int initial_chroma_value = *iter; + for (; iter != chroma_value_list_.end(); ++iter) { + // Check that all decoded frames have the same constant chroma planes. 
+ EXPECT_EQ(*iter, initial_chroma_value); + } +} + AV1_INSTANTIATE_TEST_SUITE(MonochromeTest, ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood)); - + ::libaom_test::kTwoPassGood), + ::testing::Values(0), // lossless + ::testing::Values(0)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(MonochromeAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(0, 1), // lossless + ::testing::Values(6, 9)); // cpu_used } // namespace diff --git a/third_party/libaom/source/libaom/test/noise_model_test.cc b/third_party/libaom/source/libaom/test/noise_model_test.cc index aad8905a45..c12c080cac 100644 --- a/third_party/libaom/source/libaom/test/noise_model_test.cc +++ b/third_party/libaom/source/libaom/test/noise_model_test.cc @@ -212,6 +212,12 @@ TEST(NoiseStrengthSolver, SimplifiesCurve) { aom_noise_strength_solver_free(&solver); } +TEST(NoiseStrengthLut, LutInitNegativeOrZeroSize) { + aom_noise_strength_lut_t lut; + ASSERT_FALSE(aom_noise_strength_lut_init(&lut, -1)); + ASSERT_FALSE(aom_noise_strength_lut_init(&lut, 0)); +} + TEST(NoiseStrengthLut, LutEvalSinglePoint) { aom_noise_strength_lut_t lut; ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1)); diff --git a/third_party/libaom/source/libaom/test/quant_test.cc b/third_party/libaom/source/libaom/test/quant_test.cc index 9fca953922..a042af13eb 100644 --- a/third_party/libaom/source/libaom/test/quant_test.cc +++ b/third_party/libaom/source/libaom/test/quant_test.cc @@ -20,6 +20,13 @@ namespace { +const ::libaom_test::TestMode kTestMode[] = +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime }; +#else + { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood }; +#endif + class QMTest : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, public ::libaom_test::EncoderTest { @@ -41,6 +48,11 @@ class QMTest encoder->Control(AV1E_SET_QM_MAX, qm_max_); encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + if (mode_ == ::libaom_test::kRealTime) { + 
encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); + } } } @@ -75,11 +87,10 @@ TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); } // encodes and decodes without a mismatch. TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); } -AV1_INSTANTIATE_TEST_SUITE(QMTest, - ::testing::Values(::libaom_test::kRealTime, - ::libaom_test::kOnePassGood), +AV1_INSTANTIATE_TEST_SUITE(QMTest, ::testing::ValuesIn(kTestMode), ::testing::Range(5, 9)); +#if !CONFIG_REALTIME_ONLY typedef struct { const unsigned int min_q; const unsigned int max_q; @@ -173,4 +184,5 @@ AV1_INSTANTIATE_TEST_SUITE(QuantizerBoundsCheckTestLarge, ::libaom_test::kTwoPassGood), ::testing::ValuesIn(QuantTestParams), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); +#endif // !CONFIG_REALTIME_ONLY } // namespace diff --git a/third_party/libaom/source/libaom/test/quantize_func_test.cc b/third_party/libaom/source/libaom/test/quantize_func_test.cc index 3d79cf8bd8..3523050844 100644 --- a/third_party/libaom/source/libaom/test/quantize_func_test.cc +++ b/third_party/libaom/source/libaom/test/quantize_func_test.cc @@ -589,4 +589,5 @@ INSTANTIATE_TEST_SUITE_P( static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8))); #endif // HAVE_AVX + } // namespace diff --git a/third_party/libaom/source/libaom/test/rd_test.cc b/third_party/libaom/source/libaom/test/rd_test.cc new file mode 100644 index 0000000000..0c481fcbb6 --- /dev/null +++ b/third_party/libaom/source/libaom/test/rd_test.cc @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> +#include <vector> + +#include "av1/common/quant_common.h" +#include "av1/encoder/rd.h" +#include "aom/aom_codec.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +TEST(RdTest, GetDeltaqOffsetValueTest1) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 4; + int q_index = 29; + int dc_q_step = + av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth)); + EXPECT_EQ(dc_q_step, 32); + + int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta))); + EXPECT_EQ(ref_new_dc_q_step, 16); + + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q, + static_cast<aom_bit_depth_t>(bit_depth)); + + EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step); +} + +TEST(RdTest, GetDeltaqOffsetValueTest2) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 1.0 / 4.0; + int q_index = 29; + int dc_q_step = + av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth)); + EXPECT_EQ(dc_q_step, 32); + + int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta))); + EXPECT_EQ(ref_new_dc_q_step, 64); + + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q, + static_cast<aom_bit_depth_t>(bit_depth)); + + EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step); +} + +TEST(RdTest, GetDeltaqOffsetBoundaryTest1) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 0.000000001; + std::vector<int> q_index_ls = { 254, 255 }; + for (auto q_index : q_index_ls) { + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + EXPECT_EQ(q_index + delta_q, 255); + } +} + +TEST(RdTest, GetDeltaqOffsetBoundaryTest2) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 100; + 
std::vector<int> q_index_ls = { 1, 0 }; + for (auto q_index : q_index_ls) { + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + EXPECT_EQ(q_index + delta_q, 0); + } +} + +TEST(RdTest, GetDeltaqOffsetUnitaryTest1) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 1; + for (int q_index = 0; q_index < 255; ++q_index) { + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + EXPECT_EQ(delta_q, 0); + } +} + +} // namespace diff --git a/third_party/libaom/source/libaom/test/resize_test.cc b/third_party/libaom/source/libaom/test/resize_test.cc index cb09a9a193..68d610151d 100644 --- a/third_party/libaom/source/libaom/test/resize_test.cc +++ b/third_party/libaom/source/libaom/test/resize_test.cc @@ -203,6 +203,17 @@ class ResizeTest virtual void SetUp() { InitializeConfig(GET_PARAM(1)); } + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + if (GET_PARAM(1) == ::libaom_test::kRealTime) { + encoder->Control(AV1E_SET_AQ_MODE, 3); + encoder->Control(AOME_SET_CPUUSED, 5); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + } + } + } + virtual void DecompressedFrameHook(const aom_image_t &img, aom_codec_pts_t pts) { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); @@ -241,6 +252,7 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { const unsigned int kStepDownFrame = 3; const unsigned int kStepUpFrame = 6; +#if !CONFIG_REALTIME_ONLY class ResizeInternalTestLarge : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM @@ -362,6 +374,10 @@ TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge, + ::testing::Values(::libaom_test::kOnePassGood)); +#endif + class ResizeRealtimeTest : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, public ::libaom_test::EncoderTest { @@ -375,6 +391,9 @@ class ResizeRealtimeTest 
libaom_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(AV1E_SET_AQ_MODE, 3); + encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); } @@ -786,6 +805,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } } +#if !CONFIG_REALTIME_ONLY // This class is used to check if there are any fatal // failures while encoding with resize-mode > 0 class ResizeModeTestLarge @@ -833,16 +853,6 @@ TEST_P(ResizeModeTestLarge, ResizeModeTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -AV1_INSTANTIATE_TEST_SUITE(ResizeTest, - ::testing::Values(::libaom_test::kRealTime)); -AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge, - ::testing::Values(::libaom_test::kOnePassGood)); -AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Range(5, 10)); -AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest, - ::testing::Values(::libaom_test::kRealTime)); - // TODO(anyone): Enable below test once resize issues are fixed GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge); // AV1_INSTANTIATE_TEST_SUITE( @@ -851,4 +861,14 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge); // ::libaom_test::kTwoPassGood), // ::testing::Values(1, 2), ::testing::Values(8, 12, 16), // ::testing::Values(8, 12, 16), ::testing::Range(2, 7)); +#endif // !CONFIG_REALTIME_ONLY + +AV1_INSTANTIATE_TEST_SUITE(ResizeTest, + ::testing::Values(::libaom_test::kRealTime)); +AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Range(6, 10)); +AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest, + ::testing::Values(::libaom_test::kRealTime)); + } // namespace diff --git a/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc b/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc index 
e8a1a40d87..6d3704dbfc 100644 --- a/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc +++ b/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc @@ -42,9 +42,9 @@ std::unordered_map<std::string, { 6, { { 0, 36.1 }, { 3, 36.5 } } }, { 7, { { 0, 35.5 }, { 3, 36.0 } } }, { 8, { { 0, 36.0 }, { 3, 36.5 } } }, - { 9, { { 0, 35.5 }, { 3, 36.1 } } } } }, + { 9, { { 0, 35.5 }, { 3, 36.0 } } } } }, { "niklas_1280_720_30.y4m", - { { 5, { { 0, 34.4 }, { 3, 34.4 } } }, + { { 5, { { 0, 34.4 }, { 3, 34.32 } } }, { 6, { { 0, 34.2 }, { 3, 34.2 } } }, { 7, { { 0, 33.6 }, { 3, 33.6 } } }, { 8, { { 0, 33.48 }, { 3, 33.48 } } }, @@ -125,6 +125,7 @@ class RTEndToEndTest encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2); + encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2); } } diff --git a/third_party/libaom/source/libaom/test/sad_test.cc b/third_party/libaom/source/libaom/test/sad_test.cc index afd84a8ad2..037ed2455f 100644 --- a/third_party/libaom/source/libaom/test/sad_test.cc +++ b/third_party/libaom/source/libaom/test/sad_test.cc @@ -564,8 +564,8 @@ class DistWtdCompAvgTest void CheckCompAvg() { for (int j = 0; j < 2; ++j) { for (int i = 0; i < 4; ++i) { - jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; - jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + jcp_param_.fwd_offset = quant_dist_lookup_table[i][j]; + jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j]; ReferenceDistWtdCompAvg(0); dist_wtd_comp_avg(0); @@ -632,8 +632,8 @@ class DistWtdSADavgTest void CheckSAD() { for (int j = 0; j < 2; ++j) { for (int i = 0; i < 4; ++i) { - jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; - jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + jcp_param_.fwd_offset = quant_dist_lookup_table[i][j]; + jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j]; const unsigned int reference_sad = ReferenceDistWtdSADavg(0); const unsigned int 
exp_sad = dist_wtd_SAD_avg(0); @@ -705,9 +705,7 @@ TEST_P(SADTest, ShortSrc) { source_stride_ = tmp_stride; } -#define SPEED_TEST (0) -#if SPEED_TEST -TEST_P(SADTest, Speed) { +TEST_P(SADTest, DISABLED_Speed) { const int tmp_stride = source_stride_; source_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -715,7 +713,6 @@ TEST_P(SADTest, Speed) { SpeedSAD(); source_stride_ = tmp_stride; } -#endif TEST_P(SADSkipTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); @@ -762,8 +759,7 @@ TEST_P(SADSkipTest, ShortSrc) { source_stride_ = tmp_stride; } -#if SPEED_TEST -TEST_P(SADSkipTest, Speed) { +TEST_P(SADSkipTest, DISABLED_Speed) { const int tmp_stride = source_stride_; source_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -771,7 +767,6 @@ TEST_P(SADSkipTest, Speed) { SpeedSAD(); source_stride_ = tmp_stride; } -#endif TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); @@ -1020,8 +1015,7 @@ TEST_P(SADx4Test, SrcAlignedByWidth) { source_data_ = tmp_source_data; } -#if SPEED_TEST -TEST_P(SADx4Test, Speed) { +TEST_P(SADx4Test, DISABLED_Speed) { FillRandom(source_data_, source_stride_); FillRandom(GetReference(0), reference_stride_); FillRandom(GetReference(1), reference_stride_); @@ -1029,7 +1023,6 @@ TEST_P(SADx4Test, Speed) { FillRandom(GetReference(3), reference_stride_); SpeedSAD(); } -#endif // SADSkipx4 TEST_P(SADSkipx4Test, MaxRef) { @@ -1104,8 +1097,7 @@ TEST_P(SADSkipx4Test, SrcAlignedByWidth) { source_data_ = tmp_source_data; } -#if SPEED_TEST -TEST_P(SADSkipx4Test, Speed) { +TEST_P(SADSkipx4Test, DISABLED_Speed) { FillRandom(source_data_, source_stride_); FillRandom(GetReference(0), reference_stride_); FillRandom(GetReference(1), reference_stride_); @@ -1113,12 +1105,10 @@ TEST_P(SADSkipx4Test, Speed) { FillRandom(GetReference(3), reference_stride_); SpeedSAD(); } -#endif using std::make_tuple; -#if SPEED_TEST -TEST_P(SADx4AvgTest, Speed) { +TEST_P(SADx4AvgTest, DISABLED_Speed) { int tmp_stride = 
reference_stride_; reference_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -1130,7 +1120,6 @@ TEST_P(SADx4AvgTest, Speed) { SpeedSAD(); reference_stride_ = tmp_stride; } -#endif TEST_P(SADx4AvgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); diff --git a/third_party/libaom/source/libaom/test/sharpness_test.cc b/third_party/libaom/source/libaom/test/sharpness_test.cc new file mode 100644 index 0000000000..e74609bd9d --- /dev/null +++ b/third_party/libaom/source/libaom/test/sharpness_test.cc @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <unordered_map> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +namespace { +const unsigned int kFrames = 10; +const int kBitrate = 500; +const unsigned int kCqLevel = 18; + +// List of psnr thresholds for different test combinations +// keys: test-mode, cpu-used, sharpness. 
+const std::unordered_map< + int, std::unordered_map<int, std::unordered_map<int, double>>> + kPsnrThreshold = { { static_cast<int>(::libaom_test::kTwoPassGood), + { { 2, { { 2, 37.6 }, { 5, 37.6 } } }, + { 4, { { 2, 37.5 }, { 5, 37.5 } } }, + { 6, { { 2, 37.5 }, { 5, 37.5 } } } } }, + { static_cast<int>(::libaom_test::kAllIntra), + { { 3, { { 2, 42.3 }, { 5, 42.4 } } }, + { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } }, + { 9, { { 2, 41.4 }, { 5, 41.4 } } } } } }; + +// This class is used to test sharpness parameter configured through control +// call using AOME_SET_SHARPNESS for different encoder configurations. +class SharpnessTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, + int>, + public ::libaom_test::EncoderTest { + protected: + SharpnessTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), sharpness_level_(GET_PARAM(3)), psnr_(0.0), + nframes_(0) {} + + ~SharpnessTest() override {} + + void SetUp() override { + InitializeConfig(encoding_mode_); + if (encoding_mode_ == ::libaom_test::kTwoPassGood) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_lag_in_frames = 5; + } + } + + void BeginPassHook(unsigned int) override { + psnr_ = 0.0; + nframes_ = 0; + } + + void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_SHARPNESS, sharpness_level_); + if (encoding_mode_ == ::libaom_test::kTwoPassGood) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } else if (encoding_mode_ == ::libaom_test::kAllIntra) { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ 
/ nframes_; + return 0.0; + } + + double GetPsnrThreshold() { + return kPsnrThreshold.at(encoding_mode_).at(cpu_used_).at(sharpness_level_); + } + + void DoTest() { + init_flags_ = AOM_CODEC_USE_PSNR; + + std::unique_ptr<libaom_test::VideoSource> video( + new libaom_test::Y4mVideoSource("paris_352_288_30.y4m", 0, kFrames)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()) + << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_ + << ", sharpness level = " << sharpness_level_; + } + + private: + const libaom_test::TestMode encoding_mode_; + const int cpu_used_; + const int sharpness_level_; + double psnr_; + unsigned int nframes_; +}; + +class SharpnessTestLarge : public SharpnessTest {}; + +class SharpnessAllIntraTest : public SharpnessTest {}; + +class SharpnessAllIntraTestLarge : public SharpnessTest {}; + +TEST_P(SharpnessTestLarge, SharpnessPSNRTest) { DoTest(); } + +TEST_P(SharpnessAllIntraTest, SharpnessPSNRTest) { DoTest(); } + +TEST_P(SharpnessAllIntraTestLarge, SharpnessPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(SharpnessTestLarge, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Values(2, 4, 6), // cpu_used + ::testing::Values(2, 5)); // sharpness level + +AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(6), // cpu_used + ::testing::Values(4)); // sharpness level + +AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(3, 6, 9), // cpu_used + ::testing::Values(2, 5)); // sharpness level +} // namespace diff --git a/third_party/libaom/source/libaom/test/svc_datarate_test.cc b/third_party/libaom/source/libaom/test/svc_datarate_test.cc index 8d7376a554..d2839ccc61 100644 --- a/third_party/libaom/source/libaom/test/svc_datarate_test.cc +++ 
b/third_party/libaom/source/libaom/test/svc_datarate_test.cc @@ -80,6 +80,7 @@ class DatarateTestSVC mismatch_psnr_ = 0.0; set_frame_level_er_ = 0; multi_ref_ = 0; + use_fixed_mode_svc_ = 0; } virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, @@ -89,6 +90,7 @@ class DatarateTestSVC initialize_svc(number_temporal_layers_, number_spatial_layers_, &svc_params_); encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_); + // TODO(aomedia:3032): Configure KSVC in fixed mode. encoder->Control(AV1E_SET_ENABLE_ORDER_HINT, 0); encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0); encoder->Control(AV1E_SET_DELTAQ_MODE, 0); @@ -110,7 +112,11 @@ class DatarateTestSVC set_layer_pattern(video->frame(), &layer_id_, &ref_frame_config_, spatial_layer_id, multi_ref_); encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_); - encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + // The SET_SVC_REF_FRAME_CONFIG api is for the flexible SVC mode + // (i.e., use_fixed_mode_svc == 0). + if (!use_fixed_mode_svc_) { + encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } if (set_frame_level_er_) { int mode = (layer_id_.spatial_layer_id > 0 || layer_id_.temporal_layer_id > 0); @@ -170,7 +176,7 @@ class DatarateTestSVC int lag_index = 0; int base_count = frame_cnt >> 2; layer_id->spatial_layer_id = spatial_layer; - // Set the referende map buffer idx for the 7 references: + // Set the reference map buffer idx for the 7 references: // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). 
for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { @@ -689,6 +695,48 @@ class DatarateTestSVC } } + virtual void BasicRateTargetingFixedModeSVC3TL3SLHDTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + use_fixed_mode_svc_ = 1; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45) + << " The datarate for the file is greater than target by too much!"; + } + } + virtual void BasicRateTargetingSVC3TL3SLHDMT2Test() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -1101,6 +1149,7 @@ class DatarateTestSVC double mismatch_psnr_; int set_frame_level_er_; int multi_ref_; + int 
use_fixed_mode_svc_; }; // Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial. @@ -1142,6 +1191,12 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHD) { } // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, +// for fixed mode SVC. +TEST_P(DatarateTestSVC, BasicRateTargetingFixedModeSVC3TL3SLHD) { + BasicRateTargetingFixedModeSVC3TL3SLHDTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, // for 2 threads, 2 tile_columns, row-mt enabled. TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMT2) { BasicRateTargetingSVC3TL3SLHDMT2Test(); diff --git a/third_party/libaom/source/libaom/test/tile_config_test.cc b/third_party/libaom/source/libaom/test/tile_config_test.cc index 0098903aa8..517d54bd94 100644 --- a/third_party/libaom/source/libaom/test/tile_config_test.cc +++ b/third_party/libaom/source/libaom/test/tile_config_test.cc @@ -28,6 +28,14 @@ typedef struct { const unsigned int tile_cols; } uniformTileConfigParam; +const libaom_test::TestMode kTestModeParams[] = +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime }; +#else + { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood, + ::libaom_test::kTwoPassGood }; +#endif + static const uniformTileConfigParam uniformTileConfigParams[] = { { 128, 0, 0 }, { 128, 0, 2 }, { 128, 2, 0 }, { 128, 1, 2 }, { 128, 2, 2 }, { 128, 3, 2 }, { 64, 0, 0 }, { 64, 0, 2 }, { 64, 2, 0 }, { 64, 1, 2 }, @@ -254,14 +262,12 @@ TEST_P(NonUniformTileConfigTestLarge, NonUniformTileConfigTest) { } AV1_INSTANTIATE_TEST_SUITE(UniformTileConfigTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModeParams), ::testing::ValuesIn(uniformTileConfigParams), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); AV1_INSTANTIATE_TEST_SUITE(NonUniformTileConfigTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModeParams), 
::testing::ValuesIn(nonUniformTileConfigParams), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); @@ -352,7 +358,6 @@ TEST_P(TileGroupTestLarge, TileGroupCountTest) { } AV1_INSTANTIATE_TEST_SUITE(TileGroupTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModeParams), ::testing::ValuesIn(tileGroupTestParams)); } // namespace diff --git a/third_party/libaom/source/libaom/test/time_stamp_test.cc b/third_party/libaom/source/libaom/test/time_stamp_test.cc index 205e5ba5bd..baa0dc06db 100644 --- a/third_party/libaom/source/libaom/test/time_stamp_test.cc +++ b/third_party/libaom/source/libaom/test/time_stamp_test.cc @@ -95,8 +95,13 @@ TEST_P(TimestampTest, TestAv1Rollover) { video.set_starting_pts(922337170351ll); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } - +#if CONFIG_REALTIME_ONLY +AV1_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libaom_test::kRealTime)); +#else AV1_INSTANTIATE_TEST_SUITE(TimestampTest, - ::testing::Values(::libaom_test::kTwoPassGood)); + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kTwoPassGood)); +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/tpl_model_test.cc b/third_party/libaom/source/libaom/test/tpl_model_test.cc new file mode 100644 index 0000000000..83845ee6d7 --- /dev/null +++ b/third_party/libaom/source/libaom/test/tpl_model_test.cc @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <cstdlib> +#include <vector> + +#include "av1/encoder/cost.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/encoder.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +double laplace_prob(double q_step, double b, double zero_bin_ratio, + int qcoeff) { + int abs_qcoeff = abs(qcoeff); + double z0 = fmax(exp(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + if (abs_qcoeff == 0) { + double p0 = 1 - z0; + return p0; + } else { + assert(abs_qcoeff > 0); + double z = fmax(exp(-q_step / b), TPL_EPSILON); + double p = z0 / 2 * (1 - z) * pow(z, abs_qcoeff - 1); + return p; + } +} +TEST(TplModelTest, ExponentialEntropyBoundaryTest1) { + double b = 0; + double q_step = 1; + double entropy = av1_exponential_entropy(q_step, b); + EXPECT_NEAR(entropy, 0, 0.00001); +} + +TEST(TplModelTest, TransformCoeffEntropyTest1) { + // Check the consistency between av1_estimate_coeff_entropy() and + // laplace_prob() + double b = 1; + double q_step = 1; + double zero_bin_ratio = 2; + for (int qcoeff = -256; qcoeff < 256; ++qcoeff) { + double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff); + double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff); + double ref_rate = -log2(prob); + EXPECT_DOUBLE_EQ(rate, ref_rate); + } +} + +TEST(TplModelTest, TransformCoeffEntropyTest2) { + // Check the consistency between av1_estimate_coeff_entropy(), laplace_prob() + // and av1_laplace_entropy() + double b = 1; + double q_step = 1; + double zero_bin_ratio = 2; + double est_expected_rate = 0; + for (int qcoeff = -20; qcoeff < 20; ++qcoeff) { + double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff); + double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff); + est_expected_rate += prob * rate; + } + double expected_rate = av1_laplace_entropy(q_step, b, zero_bin_ratio); + EXPECT_NEAR(expected_rate, est_expected_rate, 0.001); +} + +TEST(TplModelTest, DeltaRateCostZeroFlow) { + // When 
srcrf_dist equal to recrf_dist, av1_delta_rate_cost should return 0 + int64_t srcrf_dist = 256; + int64_t recrf_dist = 256; + int64_t delta_rate = 512; + int pixel_num = 256; + int64_t rate_cost = + av1_delta_rate_cost(delta_rate, recrf_dist, srcrf_dist, pixel_num); + EXPECT_EQ(rate_cost, 0); +} + +// a reference function of av1_delta_rate_cost() with delta_rate using bit as +// basic unit +double ref_delta_rate_cost(int64_t delta_rate, double src_rec_ratio, + int pixel_count) { + assert(src_rec_ratio <= 1 && src_rec_ratio >= 0); + double bits_per_pixel = (double)delta_rate / pixel_count; + double p = pow(2, bits_per_pixel); + double flow_rate_per_pixel = + sqrt(p * p / (src_rec_ratio * p * p + (1 - src_rec_ratio))); + double rate_cost = pixel_count * log2(flow_rate_per_pixel); + return rate_cost; +} + +TEST(TplModelTest, DeltaRateCostReference) { + const int64_t scale = TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT; + std::vector<int64_t> srcrf_dist_arr = { 256, 257, 312 }; + std::vector<int64_t> recrf_dist_arr = { 512, 288, 620 }; + std::vector<int64_t> delta_rate_arr = { 10, 278, 100 }; + for (size_t t = 0; t < srcrf_dist_arr.size(); ++t) { + int64_t srcrf_dist = srcrf_dist_arr[t]; + int64_t recrf_dist = recrf_dist_arr[t]; + int64_t delta_rate = delta_rate_arr[t]; + int64_t scaled_delta_rate = delta_rate << scale; + int pixel_count = 256; + int64_t rate_cost = av1_delta_rate_cost(scaled_delta_rate, recrf_dist, + srcrf_dist, pixel_count); + rate_cost >>= scale; + double src_rec_ratio = (double)srcrf_dist / recrf_dist; + double ref_rate_cost = + ref_delta_rate_cost(delta_rate, src_rec_ratio, pixel_count); + EXPECT_NEAR((double)rate_cost, ref_rate_cost, 1); + } +} + +TEST(TplModelTest, GetOverlapAreaHasOverlap) { + // The block a's area is [10, 17) x [18, 24). + // The block b's area is [8, 15) x [17, 23). + // The overlapping area between block a and block b is [10, 15) x [18, 23). + // Therefore, the size of the area is (15 - 10) * (23 - 18) = 25. 
+ int row_a = 10; + int col_a = 18; + int row_b = 8; + int col_b = 17; + int height = 7; + int width = 6; + int overlap_area = + av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height); + EXPECT_EQ(overlap_area, 25); +} + +TEST(TplModelTest, GetOverlapAreaNoOverlap) { + // The block a's area is [10, 14) x [18, 22). + // The block b's area is [5, 9) x [5, 9). + // Threre is no overlapping area between block a and block b. + // Therefore, the return value should be zero. + int row_a = 10; + int col_a = 18; + int row_b = 5; + int col_b = 5; + int height = 4; + int width = 4; + int overlap_area = + av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height); + EXPECT_EQ(overlap_area, 0); +} + +TEST(TPLModelTest, EstimateFrameRateTest) { + /* + * Transform size: 16x16 + * Frame count: 16 + * Transform block count: 20 + */ + const int txfm_size = 256; // 16x16 + const int frame_count = 16; + unsigned char q_index_list[16]; + TplTxfmStats stats_list[16]; + + for (int i = 0; i < frame_count; i++) { + q_index_list[i] = 1; + stats_list[i].txfm_block_count = 8; + + for (int j = 0; j < txfm_size; j++) { + stats_list[i].abs_coeff_sum[j] = 0; + } + } + + double result = + av1_estimate_gop_bitrate(q_index_list, frame_count, stats_list); + EXPECT_NEAR(result, 0, 0.1); +} + +TEST(TPLModelTest, TxfmStatsInitTest) { + TplTxfmStats tpl_txfm_stats; + av1_init_tpl_txfm_stats(&tpl_txfm_stats); + EXPECT_EQ(tpl_txfm_stats.coeff_num, 256); + EXPECT_EQ(tpl_txfm_stats.txfm_block_count, 0); + for (int i = 0; i < tpl_txfm_stats.coeff_num; ++i) { + EXPECT_DOUBLE_EQ(tpl_txfm_stats.abs_coeff_sum[i], 0); + } +} + +TEST(TPLModelTest, TxfmStatsAccumulateTest) { + TplTxfmStats sub_stats; + av1_init_tpl_txfm_stats(&sub_stats); + sub_stats.txfm_block_count = 17; + for (int i = 0; i < sub_stats.coeff_num; ++i) { + sub_stats.abs_coeff_sum[i] = i; + } + + TplTxfmStats accumulated_stats; + av1_init_tpl_txfm_stats(&accumulated_stats); + accumulated_stats.txfm_block_count = 13; + for (int i = 0; i 
< accumulated_stats.coeff_num; ++i) { + accumulated_stats.abs_coeff_sum[i] = 5 * i; + } + + av1_accumulate_tpl_txfm_stats(&sub_stats, &accumulated_stats); + EXPECT_DOUBLE_EQ(accumulated_stats.txfm_block_count, 30); + for (int i = 0; i < accumulated_stats.coeff_num; ++i) { + EXPECT_DOUBLE_EQ(accumulated_stats.abs_coeff_sum[i], 6 * i); + } +} + +TEST(TPLModelTest, TxfmStatsRecordTest) { + TplTxfmStats stats1; + TplTxfmStats stats2; + av1_init_tpl_txfm_stats(&stats1); + av1_init_tpl_txfm_stats(&stats2); + + tran_low_t coeff[256]; + for (int i = 0; i < 256; ++i) { + coeff[i] = i; + } + av1_record_tpl_txfm_block(&stats1, coeff); + EXPECT_EQ(stats1.txfm_block_count, 1); + + // we record the same transform block twice for testing purpose + av1_record_tpl_txfm_block(&stats2, coeff); + av1_record_tpl_txfm_block(&stats2, coeff); + EXPECT_EQ(stats2.txfm_block_count, 2); + + EXPECT_EQ(stats1.coeff_num, 256); + EXPECT_EQ(stats2.coeff_num, 256); + for (int i = 0; i < 256; ++i) { + EXPECT_DOUBLE_EQ(stats2.abs_coeff_sum[i], 2 * stats1.abs_coeff_sum[i]); + } +} + +} // namespace diff --git a/third_party/libaom/source/libaom/test/variance_test.cc b/third_party/libaom/source/libaom/test/variance_test.cc index fa90305acd..6bb96ce46f 100644 --- a/third_party/libaom/source/libaom/test/variance_test.cc +++ b/third_party/libaom/source/libaom/test/variance_test.cc @@ -1004,8 +1004,8 @@ void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() { for (int y0 = 0; y0 < 4; ++y0) { uint32_t sse1, sse2; uint32_t var1, var2; - jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0]; - jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1]; + jcp_param_.fwd_offset = quant_dist_lookup_table[y0][x0]; + jcp_param_.bck_offset = quant_dist_lookup_table[y0][1 - x0]; ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y, src_, width(), &sse1, sec_, &jcp_param_)); diff --git a/third_party/libaom/source/libaom/test/warp_filter_test_util.cc 
b/third_party/libaom/source/libaom/test/warp_filter_test_util.cc index 07a2e3f6e6..0e6e8b1324 100644 --- a/third_party/libaom/source/libaom/test/warp_filter_test_util.cc +++ b/third_party/libaom/source/libaom/test/warp_filter_test_util.cc @@ -226,8 +226,8 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, sub_x, sub_y, &conv_params, alpha, @@ -240,8 +240,8 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, @@ -424,8 +424,8 @@ void AV1HighbdWarpFilterTest::RunCheckOutput( conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32, @@ -441,8 +441,8 @@ void AV1HighbdWarpFilterTest::RunCheckOutput( conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = 
quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/fast.c b/third_party/libaom/source/libaom/third_party/fastfeat/fast.c index f29ac8f725..30efde8396 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/fast.c +++ b/third_party/libaom/source/libaom/third_party/fastfeat/fast.c @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off #include <stdlib.h> #include "fast.h" diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/fast.h b/third_party/libaom/source/libaom/third_party/fastfeat/fast.h index a65d5a5d17..d7a9617cce 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/fast.h +++ b/third_party/libaom/source/libaom/third_party/fastfeat/fast.h @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off #ifndef FAST_H #define FAST_H diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c b/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c index 61c654c472..c0fdbe26cd 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c +++ b/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off /*This is mechanically generated code*/ #include <stdlib.h> diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c b/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c index 0dbc660cb0..2e048e5460 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c +++ b/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off #include <stdlib.h> #include "fast.h" diff --git a/third_party/libaom/source/libaom/third_party/vector/vector.c b/third_party/libaom/source/libaom/third_party/vector/vector.c index 4b8b9c6fd9..2295b8f080 100644 --- a/third_party/libaom/source/libaom/third_party/vector/vector.c +++ b/third_party/libaom/source/libaom/third_party/vector/vector.c @@ -3,7 +3,7 @@ The MIT License(MIT) Copyright(c) 2016 Peter Goldsborough Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files(the "Software"), to deal in +this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, diff --git a/third_party/libaom/source/libaom/third_party/vector/vector.h b/third_party/libaom/source/libaom/third_party/vector/vector.h index d09eb64c93..acc70fe099 100644 --- a/third_party/libaom/source/libaom/third_party/vector/vector.h +++ 
b/third_party/libaom/source/libaom/third_party/vector/vector.h @@ -3,7 +3,7 @@ The MIT License(MIT) Copyright(c) 2016 Peter Goldsborough Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files(the "Software"), to deal in +this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c index 4c7afbaae5..bd445ab1b5 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + typedef struct S1 { int x; } T1; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c index 8c14edc109..67ab58d520 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + typedef struct XD { int u; int v; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c index 1934e20a75..26d5385e97 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + extern const int global_a[13]; const int global_b = 0; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c index 093ab55ac6..97113efc15 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + typedef struct RD { int u; int v; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c index 330fc3a90c..dd89a15621 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + typedef struct S { int x; int y; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c index 62b9d7adee..e14372c83e 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + typedef struct S1 { int x; } T1; diff --git a/third_party/libyuv/CMakeLists.txt b/third_party/libyuv/CMakeLists.txt index 60d17338ed..94bdfe0af6 100644 --- a/third_party/libyuv/CMakeLists.txt +++ b/third_party/libyuv/CMakeLists.txt @@ -24,3 +24,7 @@ if (NOT WINDOWS_MSVC_X86_64) target_link_libraries(libyuv_unittest ${ly_lib_name} gtest_main Threads::Threads) endif() + +if (LINUX_AARCH64) + target_compile_definitions(${ly_lib_name} PRIVATE LIBYUV_DISABLE_NEON=1) +endif() diff --git a/third_party/libyuv/include/libyuv/compare_row.h b/third_party/libyuv/include/libyuv/compare_row.h index e95b9d93eb..64115b3a3f 100644 --- a/third_party/libyuv/include/libyuv/compare_row.h +++ b/third_party/libyuv/include/libyuv/compare_row.h @@ -55,20 +55,20 @@ extern "C" { // The following are available for Visual C and clangcl 32 bit: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ + !defined(__clang__) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_HASHDJB2_AVX2 #define 
HAS_SUMSQUAREERROR_AVX2 #endif -// The following are available for GCC and clangcl 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +// The following are available for GCC and clangcl: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_HAMMINGDISTANCE_SSSE3 #endif -// The following are available for GCC and clangcl 64 bit: +// The following are available for GCC and clangcl: #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + (defined(__x86_64__) || defined(__i386__)) #define HAS_HAMMINGDISTANCE_AVX2 #endif diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h index 40869ef218..93e7550be8 100644 --- a/third_party/libyuv/include/libyuv/convert.h +++ b/third_party/libyuv/include/libyuv/convert.h @@ -693,6 +693,19 @@ int RAWToI420(const uint8_t* src_raw, int width, int height); +// RGB big endian (rgb in memory) to J420. +LIBYUV_API +int RAWToJ420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // RGB16 (RGBP fourcc) little endian to I420. 
LIBYUV_API int RGB565ToI420(const uint8_t* src_rgb565, diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h index 297de15162..eb4ebd54a8 100644 --- a/third_party/libyuv/include/libyuv/convert_argb.h +++ b/third_party/libyuv/include/libyuv/convert_argb.h @@ -54,12 +54,30 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) #define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) +#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define 
I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) // Alias. #define ARGBToARGB ARGBCopy @@ -125,32 +143,6 @@ int J420ToABGR(const uint8_t* src_y, int width, int height); -// Convert F420 to ARGB. BT.709 full range -LIBYUV_API -int F420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert F420 to ABGR. BT.709 full range -LIBYUV_API -int F420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - // Convert H420 to ARGB. LIBYUV_API int H420ToARGB(const uint8_t* src_y, @@ -814,29 +806,29 @@ int I010ToAR30(const uint16_t* src_y, int width, int height); -// Convert I010 to AB30. +// Convert H010 to AR30. LIBYUV_API -int I010ToAB30(const uint16_t* src_y, +int H010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, + uint8_t* dst_ar30, + int dst_stride_ar30, int width, int height); -// Convert H010 to AR30. +// Convert I010 to AB30. LIBYUV_API -int H010ToAR30(const uint16_t* src_y, +int I010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, int width, int height); @@ -1073,6 +1065,42 @@ int AR30ToAB30(const uint8_t* src_ar30, int width, int height); +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AB64 to ABGR. 
+#define AB64ToABGR AR64ToARGB + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AR64 to ABGR. +#define AR64ToABGR AB64ToARGB + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height); + +// Convert AB64 To AR64. +#define AB64ToAR64 AR64ToAB64 + // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API @@ -1385,6 +1413,19 @@ int I420ToAR30(const uint8_t* src_y, int width, int height); +// Convert I420 to AB30. +LIBYUV_API +int I420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + // Convert H420 to AR30. LIBYUV_API int H420ToAR30(const uint8_t* src_y, @@ -1398,6 +1439,19 @@ int H420ToAR30(const uint8_t* src_y, int width, int height); +// Convert H420 to AB30. +LIBYUV_API +int H420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + // Convert I420 to ARGB with matrix. LIBYUV_API int I420ToARGBMatrix(const uint8_t* src_y, @@ -1440,7 +1494,7 @@ int I444ToARGBMatrix(const uint8_t* src_y, int width, int height); -// multiply 10 bit yuv into high bits to allow any number of bits. +// Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, int src_stride_y, @@ -1454,7 +1508,7 @@ int I010ToAR30Matrix(const uint16_t* src_y, int width, int height); -// multiply 10 bit yuv into high bits to allow any number of bits. +// Convert 10 bit 420 YUV to ARGB with matrix. 
LIBYUV_API int I210ToAR30Matrix(const uint16_t* src_y, int src_stride_y, @@ -1468,6 +1522,20 @@ int I210ToAR30Matrix(const uint16_t* src_y, int width, int height); +// Convert 10 bit 444 YUV to ARGB with matrix. +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, @@ -1482,6 +1550,34 @@ int I010ToARGBMatrix(const uint16_t* src_y, int width, int height); +// multiply 12 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert 12 bit YUV to ARGB with matrix. +LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit 422 YUV to ARGB with matrix. LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, @@ -1496,6 +1592,87 @@ int I210ToARGBMatrix(const uint16_t* src_y, int width, int height); +// Convert 10 bit 444 YUV to ARGB with matrix. +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P010 to ARGB with matrix. 
+LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P210 to ARGB with matrix. +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P010 to AR30 with matrix. +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P210 to AR30 with matrix. +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// P012 and P010 use most significant bits so the conversion is the same. +// Convert P012 to ARGB with matrix. +#define P012ToARGBMatrix P010ToARGBMatrix +// Convert P012 to AR30 with matrix. +#define P012ToAR30Matrix P010ToAR30Matrix +// Convert P212 to ARGB with matrix. +#define P212ToARGBMatrix P210ToARGBMatrix +// Convert P212 to AR30 with matrix. +#define P212ToAR30Matrix P210ToAR30Matrix + +// Convert P016 to ARGB with matrix. +#define P016ToARGBMatrix P010ToARGBMatrix +// Convert P016 to AR30 with matrix. +#define P016ToAR30Matrix P010ToAR30Matrix +// Convert P216 to ARGB with matrix. +#define P216ToARGBMatrix P210ToARGBMatrix +// Convert P216 to AR30 with matrix. +#define P216ToAR30Matrix P210ToAR30Matrix + // Convert I420 with Alpha to preattenuated ARGB with matrix. 
LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, @@ -1547,6 +1724,57 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, int height, int attenuate); +// Convert I010 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + +// Convert I210 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + +// Convert I410 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, diff --git a/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/libyuv/include/libyuv/convert_from_argb.h index d992363ceb..bf48786041 100644 --- a/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -153,6 +153,30 @@ int ARGBToI444(const uint8_t* src_argb, int width, int height); +// Convert ARGB to AR64. 
+LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height); + +// Convert ABGR to AB64. +#define ABGRToAB64 ARGBToAR64 + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height); + +// Convert ABGR to AR64. +#define ABGRToAR64 ARGBToAB64 + // Convert ARGB To I422. LIBYUV_API int ARGBToI422(const uint8_t* src_argb, diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h index ebefb5682f..def773cb44 100644 --- a/third_party/libyuv/include/libyuv/planar_functions.h +++ b/third_party/libyuv/include/libyuv/planar_functions.h @@ -229,6 +229,60 @@ void MergeARGBPlane(const uint8_t* src_r, int width, int height); +// Merge separate 'depth' bit R, G and B planes stored in lsb +// into one interleaved XR30 plane. +// depth should in range [10, 16] +LIBYUV_API +void MergeXR30Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height, + int depth); + +// Merge separate 'depth' bit R, G, B and A planes stored in lsb +// into one interleaved AR64 plane. +// src_a can be NULL to fill opaque value to alpha. +// depth should in range [1, 16] +LIBYUV_API +void MergeAR64Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth); + +// Merge separate 'depth' bit R, G, B and A planes stored in lsb +// into one interleaved ARGB plane. +// src_a can be NULL to fill opaque value to alpha. 
+// depth should in range [8, 16] +LIBYUV_API +void MergeARGB16To8Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth); + // Copy I400. Supports inverting. LIBYUV_API int I400ToI400(const uint8_t* src_y, @@ -945,7 +999,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. -// shuffler is 16 bytes and must be aligned. +// shuffler is 16 bytes. LIBYUV_API int ARGBShuffle(const uint8_t* src_bgra, int src_stride_bgra, @@ -955,6 +1009,17 @@ int ARGBShuffle(const uint8_t* src_bgra, int width, int height); +// Shuffle AR64 channel order. e.g. AR64 to AB64. +// shuffler is 16 bytes. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height); + // Sobel ARGB effect with planar output. 
LIBYUV_API int ARGBSobelToPlane(const uint8_t* src_argb, diff --git a/third_party/libyuv/include/libyuv/rotate_row.h b/third_party/libyuv/include/libyuv/rotate_row.h index 022293eef2..f4c701fb4f 100644 --- a/third_party/libyuv/include/libyuv/rotate_row.h +++ b/third_party/libyuv/include/libyuv/rotate_row.h @@ -32,8 +32,9 @@ extern "C" { #define LIBYUV_DISABLE_X86 #endif #endif -// The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// The following are available for Visual C 32 bit: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ + !defined(__clang__) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index 68fb88b3e7..1444a04786 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -175,8 +175,8 @@ extern "C" { defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I444ALPHATOARGBROW_SSSE3 #define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I444ALPHATOARGBROW_SSSE3 #endif #endif @@ -240,15 +240,15 @@ extern "C" { defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I444ALPHATOARGBROW_AVX2 #define HAS_I422ALPHATOARGBROW_AVX2 +#define HAS_I444ALPHATOARGBROW_AVX2 #endif #endif -// The following are available for AVX2 Visual C and clangcl 32 bit: +// The following are available for AVX2 Visual C 32 bit: // TODO(fbarchard): Port to gcc. 
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) + !defined(__clang__) && defined(VISUALC_HAS_AVX2) #define HAS_ARGB1555TOARGBROW_AVX2 #define HAS_ARGB4444TOARGBROW_AVX2 #define HAS_ARGBTOARGB1555ROW_AVX2 @@ -269,33 +269,54 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 +#define HAS_ARGBTOAR64ROW_SSSE3 +#define HAS_ARGBTOAB64ROW_SSSE3 +#define HAS_AR64TOARGBROW_SSSE3 +#define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 +#define HAS_I212TOAR30ROW_SSSE3 +#define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOAR30ROW_SSSE3 +#define HAS_I410TOAR30ROW_SSSE3 +#define HAS_I410TOARGBROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 +#define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 +#define HAS_P210TOAR30ROW_SSSE3 +#define HAS_P210TOARGBROW_SSSE3 +#define HAS_P410TOAR30ROW_SSSE3 +#define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITARGBROW_SSE2 #define HAS_SPLITARGBROW_SSSE3 +#define HAS_SPLITXRGBROW_SSE2 +#define HAS_SPLITXRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 + +#if defined(__x86_64__) || !defined(__pic__) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I210ALPHATOARGBROW_SSSE3 +#define HAS_I410ALPHATOARGBROW_SSSE3 +#endif #endif // The following are available for AVX2 
gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ABGRTOUVROW_AVX2 @@ -303,14 +324,32 @@ extern "C" { #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_ARGBTOAR64ROW_AVX2 +#define HAS_ARGBTOAB64ROW_AVX2 +#define HAS_AR64TOARGBROW_AVX2 +#define HAS_AB64TOARGBROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 +#define HAS_MERGEAR64ROW_AVX2 +#define HAS_MERGEARGB16TO8ROW_AVX2 #define HAS_MERGEARGBROW_AVX2 +#define HAS_MERGEXR30ROW_AVX2 +#define HAS_MERGEXR64ROW_AVX2 +#define HAS_MERGEXRGB16TO8ROW_AVX2 +#define HAS_MERGEXRGBROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 +#define HAS_I212TOAR30ROW_AVX2 +#define HAS_I212TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 +#define HAS_I410TOAR30ROW_AVX2 +#define HAS_I410TOARGBROW_AVX2 +#define HAS_P210TOAR30ROW_AVX2 +#define HAS_P210TOARGBROW_AVX2 +#define HAS_P410TOAR30ROW_AVX2 +#define HAS_P410TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 @@ -319,18 +358,25 @@ extern "C" { #define HAS_MULTIPLYROW_16_AVX2 #define HAS_RGBATOYJROW_AVX2 #define HAS_SPLITARGBROW_AVX2 +#define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 #define HAS_SWAPUVROW_AVX2 // TODO(fbarchard): Fix AVX2 version of YUV24 // #define HAS_NV21TOYUV24ROW_AVX2 + +#if defined(__x86_64__) || !defined(__pic__) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I210ALPHATOARGBROW_AVX2 +#define HAS_I410ALPHATOARGBROW_AVX2 +#endif #endif // The following are 
available for AVX512 clang x86 platforms: // TODO(fbarchard): Port to GCC and Visual C // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ - (defined(CLANG_HAS_AVX512)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512)) #define HAS_ARGBTORGB24ROW_AVX512VBMI #endif @@ -353,6 +399,10 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOAR64ROW_NEON +#define HAS_ARGBTOAB64ROW_NEON +#define HAS_AR64TOARGBROW_NEON +#define HAS_AB64TOARGBROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON @@ -381,7 +431,13 @@ extern "C" { #define HAS_I422TOYUY2ROW_NEON #define HAS_I444TOARGBROW_NEON #define HAS_J400TOARGBROW_NEON +#define HAS_MERGEAR64ROW_NEON +#define HAS_MERGEARGB16TO8ROW_NEON #define HAS_MERGEARGBROW_NEON +#define HAS_MERGEXR30ROW_NEON +#define HAS_MERGEXR64ROW_NEON +#define HAS_MERGEXRGB16TO8ROW_NEON +#define HAS_MERGEXRGBROW_NEON #define HAS_MERGEUVROW_NEON #define HAS_MERGEUVROW_16_NEON #define HAS_MIRRORROW_NEON @@ -412,6 +468,7 @@ extern "C" { #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON #define HAS_SPLITARGBROW_NEON +#define HAS_SPLITXRGBROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_SPLITUVROW_16_NEON @@ -490,24 +547,14 @@ extern "C" { #define HAS_BGRATOYROW_MSA #define HAS_HALFFLOATROW_MSA #define HAS_I400TOARGBROW_MSA -#define HAS_I422ALPHATOARGBROW_MSA -#define HAS_I422TOARGBROW_MSA -#define HAS_I422TORGB24ROW_MSA -#define HAS_I422TORGBAROW_MSA #define HAS_I422TOUYVYROW_MSA #define HAS_I422TOYUY2ROW_MSA -#define HAS_I444TOARGBROW_MSA -#define HAS_I422TOARGB1555ROW_MSA -#define HAS_I422TORGB565ROW_MSA #define HAS_INTERPOLATEROW_MSA #define HAS_J400TOARGBROW_MSA #define HAS_MERGEUVROW_MSA 
#define HAS_MIRRORROW_MSA #define HAS_MIRRORUVROW_MSA #define HAS_MIRRORSPLITUVROW_MSA -#define HAS_NV12TOARGBROW_MSA -#define HAS_NV12TORGB565ROW_MSA -#define HAS_NV21TOARGBROW_MSA #define HAS_RAWTOARGBROW_MSA #define HAS_RAWTORGB24ROW_MSA #define HAS_RAWTOUVROW_MSA @@ -527,10 +574,8 @@ extern "C" { #define HAS_SOBELXYROW_MSA #define HAS_SOBELYROW_MSA #define HAS_SPLITUVROW_MSA -#define HAS_UYVYTOARGBROW_MSA #define HAS_UYVYTOUVROW_MSA #define HAS_UYVYTOYROW_MSA -#define HAS_YUY2TOARGBROW_MSA #define HAS_YUY2TOUV422ROW_MSA #define HAS_YUY2TOUVROW_MSA #define HAS_YUY2TOYROW_MSA @@ -580,8 +625,6 @@ extern "C" { #define HAS_I400TOARGBROW_MMI #define HAS_I422TOUYVYROW_MMI #define HAS_I422TOYUY2ROW_MMI -#define HAS_I422TOARGBROW_MMI -#define HAS_I444TOARGBROW_MMI #define HAS_INTERPOLATEROW_MMI #define HAS_J400TOARGBROW_MMI #define HAS_MERGERGBROW_MMI @@ -612,20 +655,6 @@ extern "C" { #define HAS_YUY2TOUV422ROW_MMI #define HAS_YUY2TOUVROW_MMI #define HAS_YUY2TOYROW_MMI -#define HAS_I210TOARGBROW_MMI -#define HAS_I422TOARGB4444ROW_MMI -#define HAS_I422TOARGB1555ROW_MMI -#define HAS_I422TORGB565ROW_MMI -#define HAS_NV21TORGB24ROW_MMI -#define HAS_NV12TORGB24ROW_MMI -#define HAS_I422ALPHATOARGBROW_MMI -#define HAS_I422TORGB24ROW_MMI -#define HAS_NV12TOARGBROW_MMI -#define HAS_NV21TOARGBROW_MMI -#define HAS_NV12TORGB565ROW_MMI -#define HAS_YUY2TOARGBROW_MMI -#define HAS_UYVYTOARGBROW_MMI -#define HAS_I422TORGBAROW_MMI #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -634,6 +663,7 @@ extern "C" { #else #define SIMD_ALIGNED(var) __declspec(align(16)) var #endif +#define LIBYUV_NOINLINE __declspec(noinline) typedef __declspec(align(16)) int16_t vec16[8]; typedef __declspec(align(16)) int32_t vec32[4]; typedef __declspec(align(16)) float vecf32[4]; @@ -654,6 +684,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32]; #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #endif +#define LIBYUV_NOINLINE __attribute__((noinline)) typedef 
int16_t __attribute__((vector_size(16))) vec16; typedef int32_t __attribute__((vector_size(16))) vec32; typedef float __attribute__((vector_size(16))) vecf32; @@ -669,6 +700,7 @@ typedef uint32_t __attribute__((vector_size(32))) ulvec32; typedef uint8_t __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var +#define LIBYUV_NOINLINE typedef int16_t vec16[8]; typedef int32_t vec32[4]; typedef float vecf32[4]; @@ -684,33 +716,18 @@ typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif -#if defined(__aarch64__) -// This struct is for Arm64 color conversion. -struct YuvConstants { - uvec16 kUVToRB; - uvec16 kUVToRB2; - uvec16 kUVToG; - uvec16 kUVToG2; - vec16 kUVBiasBGR; - vec32 kYToRgb; -}; -#elif defined(__arm__) -// This struct is for ArmV7 color conversion. +#if defined(__aarch64__) || defined(__arm__) +// This struct is for ARM color conversion. struct YuvConstants { - uvec8 kUVToRB; - uvec8 kUVToG; - vec16 kUVBiasBGR; - vec32 kYToRgb; + uvec8 kUVCoeff; + vec16 kRGBCoeffBias; }; #else // This struct is for Intel color conversion. 
struct YuvConstants { - int8_t kUVToB[32]; - int8_t kUVToG[32]; - int8_t kUVToR[32]; - int16_t kUVBiasB[16]; - int16_t kUVBiasG[16]; - int16_t kUVBiasR[16]; + uint8_t kUVToB[32]; + uint8_t kUVToG[32]; + uint8_t kUVToR[32]; int16_t kYToRgb[16]; int16_t kYBiasToRgb[16]; }; @@ -719,11 +736,8 @@ struct YuvConstants { #define KUVTOB 0 #define KUVTOG 32 #define KUVTOR 64 -#define KUVBIASB 96 -#define KUVBIASG 128 -#define KUVBIASR 160 -#define KYTORGB 192 -#define KYBIASTORGB 224 +#define KYTORGB 96 +#define KYBIASTORGB 128 #endif @@ -995,11 +1009,11 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1194,16 +1208,16 @@ void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_MMI(const uint8_t* 
src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); -void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -1305,42 +1319,42 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_v, int width); void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, 
- int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1349,7 +1363,7 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1372,47 +1386,47 @@ void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, 
uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1621,7 +1635,7 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); @@ -1637,9 +1651,13 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); -void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width); +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width); +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1860,23 +1878,23 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); -void MergeARGBRow_Any_SSE2(const uint8_t* src_r, - const 
uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); -void MergeARGBRow_Any_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); -void MergeARGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); void SplitARGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, @@ -1902,31 +1920,31 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_NEON(const uint8_t* src_argb, +void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSE2(const uint8_t* src_argb, +void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSSE3(const uint8_t* src_argb, +void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_AVX2(const uint8_t* src_argb, +void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_NEON(const uint8_t* src_argb, +void SplitARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, @@ -1952,20 +1970,20 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int 
width); -void MergeXRGBRow_Any_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); -void MergeXRGBRow_Any_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); -void MergeXRGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, @@ -1987,32 +2005,205 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_NEON(const uint8_t* src_argb, +void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSE2(const uint8_t* src_argb, +void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSSE3(const uint8_t* src_argb, +void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_AVX2(const uint8_t* src_argb, +void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_NEON(const uint8_t* src_argb, +void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const 
uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width); +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_NEON(const uint16_t* 
src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeAR64Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeXR64Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXR30Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeAR64Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXR64Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + 
int width); + void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, @@ -2024,10 +2215,10 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, int depth, int width); void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width); void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, @@ -2073,16 +2264,16 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void MultiplyRow_16_Any_AVX2(const uint16_t* src_y, - uint16_t* dst_y, +void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void MultiplyRow_16_Any_NEON(const uint16_t* src_y, - uint16_t* dst_y, +void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); @@ -2094,16 +2285,16 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void DivideRow_16_Any_AVX2(const uint16_t* src_y, - uint16_t* dst_y, +void DivideRow_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void DivideRow_16_Any_NEON(const uint16_t* src_y, - uint16_t* dst_y, +void DivideRow_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); @@ -2527,6 +2718,71 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_C(const 
uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width); +void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width); +void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width); +void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width); +void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width); +void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR64Row_Any_NEON(const uint8_t* 
src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); + void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); @@ -2575,6 +2831,44 @@ void I210ToARGBRow_C(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -2626,6 +2920,27 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int 
width); +void P210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); + void I422ToRGBARow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -2705,6 +3020,44 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToAR30Row_AVX2(const uint8_t* 
y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2723,6 +3076,44 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2821,6 +3212,48 @@ void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); + +void P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + 
int width); +void P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); + void I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2923,6 +3356,44 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, 
+ const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2941,6 +3412,44 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3039,6 +3548,46 @@ void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* 
uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3120,15 +3669,15 @@ void I400ToARGBRow_MMI(const uint8_t* src_y, int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -3140,11 +3689,11 @@ void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, int width); // ARGB preattenuated alpha blend. 
-void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3156,7 +3705,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBBlendRow_C(const uint8_t* src_argb0, +void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3200,11 +3749,11 @@ void BlendPlaneRow_C(const uint8_t* src0, // ARGB multiply images. Same API as Blend, but these require // pointer and width alignment for SSE2. -void ARGBMultiplyRow_C(const uint8_t* src_argb0, +void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3212,7 +3761,7 @@ void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3220,7 +3769,7 @@ void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3246,11 +3795,11 @@ void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, int width); // ARGB add images. 
-void ARGBAddRow_C(const uint8_t* src_argb0, +void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8_t* src_argb0, +void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3258,7 +3807,7 @@ void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBAddRow_AVX2(const uint8_t* src_argb0, +void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3266,7 +3815,7 @@ void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3293,11 +3842,11 @@ void ARGBAddRow_Any_MMI(const uint8_t* y_buf, // ARGB subtract images. Same API as Blend, but these require // pointer and width alignment for SSE2. 
-void ARGBSubtractRow_C(const uint8_t* src_argb0, +void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3305,7 +3854,7 @@ void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3313,7 +3862,7 @@ void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3520,9 +4069,9 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, @@ -3537,6 +4086,46 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void P210ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, 
+ uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3684,7 +4273,7 @@ void YUY2ToUV422Row_C(const uint8_t* src_yuy2, int width); void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3694,7 +4283,7 @@ void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, int width); void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3704,7 +4293,7 @@ void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, int width); void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3805,7 +4394,7 @@ void UYVYToUV422Row_C(const uint8_t* src_uyvy, int width); void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3815,7 
+4404,7 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, int width); void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3825,7 +4414,7 @@ void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, int width); void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3862,29 +4451,29 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_C(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_C(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_vu, int width); -void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv, - int stride_ayuv, - uint8_t* dst_uv, +void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_vu, int width); -void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, - int stride_ayuv, +void AYUVToVURow_Any_NEON(const uint8_t* src_ptr, + int src_stride, uint8_t* dst_vu, int width); diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h 
index 18ffb546a3..461ac36f33 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -74,15 +74,16 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SCALEUVROWDOWN2BOX_SSSE3 #define HAS_SCALEROWUP2LINEAR_SSE2 #define HAS_SCALEROWUP2LINEAR_SSSE3 #define HAS_SCALEROWUP2BILINEAR_SSE2 #define HAS_SCALEROWUP2BILINEAR_SSSE3 -#define HAS_SCALEROWUP2LINEAR_16_SSSE3 -#define HAS_SCALEROWUP2BILINEAR_16_SSSE3 +#define HAS_SCALEROWUP2LINEAR_12_SSSE3 +#define HAS_SCALEROWUP2BILINEAR_12_SSSE3 +#define HAS_SCALEROWUP2LINEAR_16_SSE2 +#define HAS_SCALEROWUP2BILINEAR_16_SSE2 #define HAS_SCALEUVROWUP2LINEAR_SSSE3 #define HAS_SCALEUVROWUP2BILINEAR_SSSE3 #define HAS_SCALEUVROWUP2LINEAR_16_SSE2 @@ -92,12 +93,14 @@ extern "C" { // The following are available for gcc/clang x86 platforms, but // require clang 3.4 or gcc 4.7. 
// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \ +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_SCALEUVROWDOWN2BOX_AVX2 #define HAS_SCALEROWUP2LINEAR_AVX2 #define HAS_SCALEROWUP2BILINEAR_AVX2 +#define HAS_SCALEROWUP2LINEAR_12_AVX2 +#define HAS_SCALEROWUP2BILINEAR_12_AVX2 #define HAS_SCALEROWUP2LINEAR_16_AVX2 #define HAS_SCALEROWUP2BILINEAR_16_AVX2 #define HAS_SCALEUVROWUP2LINEAR_AVX2 @@ -134,6 +137,8 @@ extern "C" { #define HAS_SCALEUVROWDOWNEVEN_NEON #define HAS_SCALEROWUP2LINEAR_NEON #define HAS_SCALEROWUP2BILINEAR_NEON +#define HAS_SCALEROWUP2LINEAR_12_NEON +#define HAS_SCALEROWUP2BILINEAR_12_NEON #define HAS_SCALEROWUP2LINEAR_16_NEON #define HAS_SCALEROWUP2BILINEAR_16_NEON #define HAS_SCALEUVROWUP2LINEAR_NEON @@ -611,14 +616,22 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); -void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -635,6 +648,14 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void 
ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -651,9 +672,17 @@ void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_Any_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); +void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, @@ -675,6 +704,14 @@ void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1424,6 +1461,14 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1440,6 +1485,14 @@ void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void 
ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h index e59b316a60..f713c47704 100644 --- a/third_party/libyuv/include/libyuv/version.h +++ b/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1779 +#define LIBYUV_VERSION 1788 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/third_party/libyuv/include/libyuv/video_common.h b/third_party/libyuv/include/libyuv/video_common.h index 0da3fb5544..32b8a5210b 100644 --- a/third_party/libyuv/include/libyuv/video_common.h +++ b/third_party/libyuv/include/libyuv/video_common.h @@ -65,12 +65,14 @@ enum FourCC { // 1 Secondary YUV format: row biplanar. deprecated. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc + // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit + FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. 
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), @@ -163,6 +165,8 @@ enum FourCCBpp { FOURCC_BPP_RGBA = 32, FOURCC_BPP_AR30 = 32, FOURCC_BPP_AB30 = 32, + FOURCC_BPP_AR64 = 64, + FOURCC_BPP_AB64 = 64, FOURCC_BPP_24BG = 24, FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, diff --git a/third_party/libyuv/source/compare_common.cc b/third_party/libyuv/source/compare_common.cc index d4b170ad98..d1cab8d2b4 100644 --- a/third_party/libyuv/source/compare_common.cc +++ b/third_party/libyuv/source/compare_common.cc @@ -17,36 +17,6 @@ namespace libyuv { extern "C" { #endif -#if ORIGINAL_OPT -uint32_t HammingDistance_C1(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - int i; - for (i = 0; i < count; ++i) { - int x = src_a[i] ^ src_b[i]; - if (x & 1) - ++diff; - if (x & 2) - ++diff; - if (x & 4) - ++diff; - if (x & 8) - ++diff; - if (x & 16) - ++diff; - if (x & 32) - ++diff; - if (x & 64) - ++diff; - if (x & 128) - ++diff; - } - return diff; -} -#endif - // Hakmem method for hamming distance. uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, diff --git a/third_party/libyuv/source/compare_gcc.cc b/third_party/libyuv/source/compare_gcc.cc index 6700f9697e..b834b42ac4 100644 --- a/third_party/libyuv/source/compare_gcc.cc +++ b/third_party/libyuv/source/compare_gcc.cc @@ -19,8 +19,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. 
-#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if defined(__x86_64__) uint32_t HammingDistance_SSE42(const uint8_t* src_a, diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc index d57d3d9d1c..9bb27f1dd1 100644 --- a/third_party/libyuv/source/compare_win.cc +++ b/third_party/libyuv/source/compare_win.cc @@ -22,8 +22,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, @@ -77,8 +78,7 @@ __declspec(naked) uint32_t } } -// Visual C 2012 required for AVX2. -#if _MSC_VER >= 1700 +#ifdef HAS_SUMSQUAREERROR_AVX2 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. #pragma warning(disable : 4752) __declspec(naked) uint32_t @@ -118,7 +118,7 @@ __declspec(naked) uint32_t ret } } -#endif // _MSC_VER >= 1700 +#endif // HAS_SUMSQUAREERROR_AVX2 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { @@ -196,7 +196,7 @@ __declspec(naked) uint32_t } // Visual C 2012 required for AVX2. 
-#if _MSC_VER >= 1700 +#ifdef HAS_HASHDJB2_AVX2 __declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { @@ -231,7 +231,7 @@ __declspec(naked) uint32_t ret } } -#endif // _MSC_VER >= 1700 +#endif // HAS_HASHDJB2_AVX2 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc index 1bd596599b..69f7fb6e01 100644 --- a/third_party/libyuv/source/convert.cc +++ b/third_party/libyuv/source/convert.cc @@ -400,7 +400,7 @@ int I210ToI010(const uint16_t* src_y, } // Any I[420]1[02] to P[420]1[02] format with mirroring. -static int Ix1xToPx1x(const uint16_t* src_y, +static int IxxxToPxxx(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, @@ -441,7 +441,7 @@ int I010ToP010(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 10); } @@ -459,7 +459,7 @@ int I210ToP210(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 10); } @@ -477,7 +477,7 @@ int I012ToP012(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 12); } @@ -495,7 +495,7 @@ int I212ToP212(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, 
src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 12); } @@ -1368,6 +1368,18 @@ int ARGBToI420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1388,22 +1400,6 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif #if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; @@ -1771,7 +1767,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, } // Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYROW_NEON) +#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON; @@ -1808,6 +1804,14 @@ int RGB24ToI420(const uint8_t* src_rgb24, #endif // Other platforms do intermediate conversion from RGB24 to ARGB. 
#else +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1816,6 +1820,18 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1960,6 +1976,14 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } #endif #else +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1968,6 +1992,18 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; @@ -2111,6 +2147,26 @@ int RAWToI420(const uint8_t* src_raw, #endif // Other platforms do intermediate conversion from RAW to ARGB. 
#else +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -2186,6 +2242,178 @@ int RAWToI420(const uint8_t* src_raw, return 0; } +// TODO(fbarchard): Use Matrix version to implement I420 and J420. +// Convert RAW to J420. +LIBYUV_API +int RAWToJ420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVJRow = RAWToUVJRow_NEON; + } + } + } +// MMI and MSA version does direct RAW to YUV. +#elif (defined(HAS_RAWTOYJROW_MMI) || defined(HAS_RAWTOYJROW_MSA)) +#if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToUVJRow = RAWToUVJRow_Any_MMI; + RAWToYJRow = RAWToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_MMI; + if (IS_ALIGNED(width, 16)) { + RAWToUVJRow = RAWToUVJRow_MMI; + } + } + } +#endif +#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVJRow = RAWToUVJRow_Any_MSA; + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; + RAWToUVJRow = RAWToUVJRow_MSA; + } + } +#endif +#else +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) && 
defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#endif + + { +#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !((defined(HAS_RAWTOYJROW_NEON) && 
defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + // Convert RGB565 to I420. LIBYUV_API int RGB565ToI420(const uint8_t* src_rgb565, diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc index 87d7d73250..d8f7b27738 100644 --- a/third_party/libyuv/source/convert_argb.cc +++ b/third_party/libyuv/source/convert_argb.cc @@ -888,6 +888,63 @@ int U010ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } +// Convert 12 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to +// multiply 12 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I212TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToAR30Row = I212ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToAR30Row = I212ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I212TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToAR30Row = I212ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToAR30Row = I212ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. @@ -1045,6 +1102,58 @@ int U210ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert 10 bit YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, @@ -1088,14 +1197,6 @@ int I010ToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_I210TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I210ToARGBRow = I210ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I210ToARGBRow = I210ToARGBRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; @@ -1216,6 +1317,61 @@ int U010ToABGR(const uint16_t* src_y, width, height); } +// Convert 12 bit YUV to ARGB with matrix. 
+LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I212TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToARGBRow = I212ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToARGBRow = I212ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I212TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToARGBRow = I212ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToARGBRow = I212ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + // Convert 10 bit 422 YUV to ARGB with matrix. 
LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, @@ -1259,14 +1415,6 @@ int I210ToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_I210TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I210ToARGBRow = I210ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I210ToARGBRow = I210ToARGBRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; @@ -1385,6 +1533,254 @@ int U210ToABGR(const uint16_t* src_y, width, height); } +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + // Convert I420 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, @@ -1903,6 +2299,323 @@ int I444AlphaToABGR(const uint8_t* src_y, width, height, attenuate); } +// Convert I010 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} 
+ +// Convert I210 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + 
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I410 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert I400 
to ARGB with matrix. LIBYUV_API int I400ToARGBMatrix(const uint8_t* src_y, @@ -2078,6 +2791,10 @@ static const uvec8 kShuffleMaskABGRToARGB = { static const uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; +// Shuffle table for converting AR64 to AB64. +static const uvec8 kShuffleMaskAR64ToAB64 = { + 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u}; + // Convert BGRA to ARGB. LIBYUV_API int BGRAToARGB(const uint8_t* src_bgra, @@ -2087,7 +2804,7 @@ int BGRAToARGB(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). @@ -2099,7 +2816,7 @@ int ARGBToBGRA(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ABGR to ARGB. @@ -2111,7 +2828,7 @@ int ABGRToARGB(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). @@ -2123,7 +2840,7 @@ int ARGBToABGR(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert RGBA to ARGB. 
@@ -2135,7 +2852,19 @@ int RGBAToARGB(const uint8_t* src_rgba, int width, int height) { return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); + (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height); +} + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, + (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); } // Convert RGB24 to ARGB. @@ -2644,6 +3373,124 @@ int AR30ToAB30(const uint8_t* src_ar30, return 0; } +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AR64ToARGBRow_C; + if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. 
+ if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_argb = 0; + } +#if defined(HAS_AR64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AR64ToARGBRow = AR64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ToARGBRow = AR64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ToARGBRow(src_ar64, dst_argb, width); + src_ar64 += src_stride_ar64; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AB64ToARGBRow_C; + if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; + src_stride_ab64 = -src_stride_ab64; + } + // Coalesce rows. 
+ if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ab64 = dst_stride_argb = 0; + } +#if defined(HAS_AB64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AB64ToARGBRow = AB64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AB64ToARGBRow = AB64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AB64ToARGBRow(src_ab64, dst_argb, width); + src_ab64 += src_stride_ab64; + dst_argb += dst_stride_argb; + } + return 0; +} + // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, @@ -4463,6 +5310,40 @@ int H420ToAR30(const uint8_t* src_y, &kYvuH709Constants, width, height); } +// Convert I420 to AB30. +LIBYUV_API +int I420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H420 to AB30. 
+LIBYUV_API +int H420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc index 4ba4bb5e0f..e14615847d 100644 --- a/third_party/libyuv/source/convert_from_argb.cc +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -2009,6 +2009,124 @@ int ARGBToJ422(const uint8_t* src_argb, return 0; } +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Convert ARGB to J400. LIBYUV_API int ARGBToJ400(const uint8_t* src_argb, diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc index 219c216509..7cea06c8d7 100644 --- a/third_party/libyuv/source/planar_functions.cc +++ b/third_party/libyuv/source/planar_functions.cc @@ -10,6 +10,7 @@ #include "libyuv/planar_functions.h" +#include <assert.h> #include <string.h> // for memset() #include "libyuv/cpu_id.h" @@ -563,9 +564,9 @@ void SplitUVPlane_16(const uint16_t* src_uv, int height, int depth) { int y; - int scale = 1 << depth; - void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, - int scale, int width) = SplitUVRow_16_C; + void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, + uint16_t* dst_v, int depth, int width) = + SplitUVRow_16_C; // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -583,24 +584,24 @@ void SplitUVPlane_16(const uint16_t* src_uv, } #if defined(HAS_SPLITUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_16_Any_AVX2; + SplitUVRow_16 = SplitUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_16_AVX2; + SplitUVRow_16 = SplitUVRow_16_AVX2; } } #endif #if defined(HAS_SPLITUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_16_Any_NEON; + SplitUVRow_16 = SplitUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_16_NEON; + SplitUVRow_16 = SplitUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Copy a row of UV. - SplitUVRow(src_uv, dst_u, dst_v, scale, width); + SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); dst_u += dst_stride_u; dst_v += dst_stride_v; src_uv += src_stride_uv; @@ -618,9 +619,11 @@ void MergeUVPlane_16(const uint16_t* src_u, int height, int depth) { int y; - int scale = 1 << (16 - depth); - void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v, - uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C; + void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, + uint16_t* dst_uv, int depth, int width) = + MergeUVRow_16_C; + assert(depth >= 8); + assert(depth <= 16); // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -636,24 +639,24 @@ void MergeUVPlane_16(const uint16_t* src_u, } #if defined(HAS_MERGEUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow = MergeUVRow_16_Any_AVX2; + MergeUVRow_16 = MergeUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_16_AVX2; + MergeUVRow_16 = MergeUVRow_16_AVX2; } } #endif #if defined(HAS_MERGEUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow = MergeUVRow_16_Any_NEON; + MergeUVRow_16 = MergeUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { - MergeUVRow = MergeUVRow_16_NEON; + MergeUVRow_16 = MergeUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. - MergeUVRow(src_u, src_v, dst_uv, scale, width); + MergeUVRow_16(src_u, src_v, dst_uv, depth, width); src_u += src_stride_u; src_v += src_stride_v; dst_uv += dst_stride_uv; @@ -671,8 +674,8 @@ void ConvertToMSBPlane_16(const uint16_t* src_y, int depth) { int y; int scale = 1 << (16 - depth); - void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, - int width) = MultiplyRow_16_C; + void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = MultiplyRow_16_C; // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -688,23 +691,23 @@ void ConvertToMSBPlane_16(const uint16_t* src_y, #if defined(HAS_MULTIPLYROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - MultiplyRow = MultiplyRow_16_Any_AVX2; + MultiplyRow_16 = MultiplyRow_16_Any_AVX2; if (IS_ALIGNED(width, 32)) { - MultiplyRow = MultiplyRow_16_AVX2; + MultiplyRow_16 = MultiplyRow_16_AVX2; } } #endif #if defined(HAS_MULTIPLYROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - MultiplyRow = MultiplyRow_16_Any_NEON; + MultiplyRow_16 = MultiplyRow_16_Any_NEON; if (IS_ALIGNED(width, 16)) { - MultiplyRow = MultiplyRow_16_NEON; + MultiplyRow_16 = MultiplyRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { - MultiplyRow(src_y, dst_y, scale, width); + MultiplyRow_16(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } @@ -982,6 +985,142 @@ void MergeRGBPlane(const uint8_t* src_r, } } +LIBYUV_NOINLINE +void SplitARGBPlaneAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int y; + void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, uint8_t* dst_a, int width) = + SplitARGBRow_C; + + assert(height > 0); + + if (src_stride_argb == width * 4 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = + dst_stride_a = 0; + } + +#if defined(HAS_SPLITARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitARGBRow = SplitARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + SplitARGBRow = SplitARGBRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitARGBRow = SplitARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + SplitARGBRow = SplitARGBRow_SSSE3; + } + } +#endif +#if 
defined(HAS_SPLITARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitARGBRow = SplitARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitARGBRow = SplitARGBRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitARGBRow = SplitARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitARGBRow = SplitARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + dst_a += dst_stride_a; + src_argb += src_stride_argb; + } +} + +LIBYUV_NOINLINE +void SplitARGBPlaneOpaque(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitXRGBRow_C; + assert(height > 0); + + if (src_stride_argb == width * 4 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } + +#if defined(HAS_SPLITXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitXRGBRow = SplitXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitXRGBRow = SplitXRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitXRGBRow = SplitXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitXRGBRow = SplitXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_NEON; + } + } 
+#endif + + for (y = 0; y < height; ++y) { + SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_argb += src_stride_argb; + } +} + LIBYUV_API void SplitARGBPlane(const uint8_t* src_argb, int src_stride_argb, @@ -995,137 +1134,146 @@ void SplitARGBPlane(const uint8_t* src_argb, int dst_stride_a, int width, int height) { - int y; - void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, uint8_t* dst_a, int width) = - SplitARGBRow_C; - void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, int width) = SplitXRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_a = dst_a + (height - 1) * dst_stride_a; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + dst_stride_a = -dst_stride_a; + } if (dst_a == NULL) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - } - - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = - dst_stride_a = 0; - } + SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, width, height); + } else { + SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a, + width, height); + } +} -#if defined(HAS_SPLITARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitXRGBRow = SplitXRGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitXRGBRow = SplitXRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSSE3; - } +LIBYUV_NOINLINE +void MergeARGBPlaneAlpha(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, const uint8_t* src_a, + uint8_t* dst_argb, int width) = MergeARGBRow_C; + + assert(height > 0); + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeARGBRow = MergeARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeARGBRow = MergeARGBRow_SSE2; } + } #endif -#if defined(HAS_SPLITARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitXRGBRow = SplitXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - 
SplitXRGBRow = SplitXRGBRow_AVX2; - } +#if defined(HAS_MERGEARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGBRow = MergeARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_AVX2; } + } #endif -#if defined(HAS_SPLITRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitXRGBRow = SplitXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitXRGBRow = SplitXRGBRow_NEON; - } +#if defined(HAS_MERGEARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeARGBRow = MergeARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_NEON; } + } #endif - for (y = 0; y < height; ++y) { - SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - src_argb += src_stride_argb; - } - } else { - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_a = dst_a + (height - 1) * dst_stride_a; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - dst_stride_a = -dst_stride_a; - } - - if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width && - dst_stride_a == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = - dst_stride_a = 0; - } + for (y = 0; y < height; ++y) { + MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_argb += dst_stride_argb; + } +} -#if defined(HAS_SPLITARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitARGBRow = SplitARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitARGBRow = SplitARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { 
- SplitARGBRow = SplitARGBRow_SSSE3; - } +LIBYUV_NOINLINE +void MergeARGBPlaneOpaque(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_argb, int width) = + MergeXRGBRow_C; + + assert(height > 0); + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeXRGBRow = MergeXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeXRGBRow = MergeXRGBRow_SSE2; } + } #endif -#if defined(HAS_SPLITARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitARGBRow = SplitARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_AVX2; - } +#if defined(HAS_MERGEXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGBRow = MergeXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_AVX2; } + } #endif -#if defined(HAS_SPLITRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitARGBRow = SplitARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_NEON; - } +#if defined(HAS_MERGEXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGBRow = MergeXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_NEON; } + } #endif - for (y = 0; y < height; ++y) { - SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - dst_a += dst_stride_a; - src_argb += src_stride_argb; - } + for (y = 0; y < height; ++y) { + MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += 
src_stride_b; + dst_argb += dst_stride_argb; } } @@ -1142,107 +1290,357 @@ void MergeARGBPlane(const uint8_t* src_r, int dst_stride_argb, int width, int height) { - int y; - void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, const uint8_t* src_a, - uint8_t* dst_argb, int width) = MergeARGBRow_C; - void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, uint8_t* dst_argb, int width) = - MergeXRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } if (src_a == NULL) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_r == width && src_stride_g == width && - src_stride_b == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height); + } else { + MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height); + } +} + +// TODO(yuan): Support 2 bit alpha channel. +LIBYUV_API +void MergeXR30Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height, + int depth) { + int y; + void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_ar30, int depth, + int width) = MergeXR30Row_C; + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; + } +#if defined(HAS_MERGEXR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR30Row = MergeXR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR30Row = MergeXR30Row_AVX2; } -#if defined(HAS_MERGEARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeXRGBRow = MergeXRGBRow_Any_SSE2; + } +#endif +#if defined(HAS_MERGEXR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (depth == 10) { + MergeXR30Row = MergeXR30Row_10_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_10_NEON; + } + } else { + MergeXR30Row = MergeXR30Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - MergeXRGBRow = MergeXRGBRow_SSE2; + MergeXR30Row = MergeXR30Row_NEON; } } + } #endif -#if defined(HAS_MERGEARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXRGBRow = MergeXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_AVX2; - } + + for (y = 0; y < height; ++y) { + MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar30 += dst_stride_ar30; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint16_t* dst_argb, int depth, int width) = + MergeAR64Row_C; + + if (src_stride_r == width && src_stride_g == 
width && src_stride_b == width && + src_stride_a == width && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeAR64Row = MergeAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeAR64Row = MergeAR64Row_AVX2; } + } #endif -#if defined(HAS_MERGERGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXRGBRow = MergeXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_NEON; - } +#if defined(HAS_MERGEAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeAR64Row = MergeAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeAR64Row = MergeAR64Row_NEON; } + } #endif - for (y = 0; y < height; ++y) { - MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; + for (y = 0; y < height; ++y) { + MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint16_t* dst_argb, int depth, + int width) = MergeXR64Row_C; + + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEXR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR64Row = MergeXR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR64Row = MergeXR64Row_AVX2; } - } else { - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + } +#endif +#if defined(HAS_MERGEXR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXR64Row = MergeXR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR64Row = MergeXR64Row_NEON; } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_API +void MergeAR64Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; + dst_stride_ar64 = -dst_stride_ar64; + } + + if (src_a == NULL) { + MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_ar64, dst_stride_ar64, width, height, + depth); + } else { + MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_ar64, + dst_stride_ar64, width, height, depth); + } +} - if (src_stride_r == width && src_stride_g == width && - src_stride_b == width && src_stride_a == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_argb = 0; +LIBYUV_NOINLINE +static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint8_t* dst_argb, int depth, int width) = + MergeARGB16To8Row_C; + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGB16To8Row = MergeARGB16To8Row_AVX2; } -#if defined(HAS_MERGEARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeARGBRow = MergeARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - MergeARGBRow = MergeARGBRow_SSE2; - } + } +#endif +#if defined(HAS_MERGEARGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + 
MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeARGB16To8Row = MergeARGB16To8Row_NEON; } + } #endif -#if defined(HAS_MERGEARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeARGBRow = MergeARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_AVX2; - } + + for (y = 0; y < height; ++y) { + MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_NOINLINE +static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_argb, int depth, + int width) = MergeXRGB16To8Row_C; + + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; } + } #endif -#if defined(HAS_MERGERGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeARGBRow = MergeARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_NEON; - } +#if defined(HAS_MERGEXRGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; } + } #endif - for (y = 0; y < height; ++y) { - MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; - } + for (y = 0; y < height; ++y) { + MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_API +void MergeARGB16To8Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + if (src_a == NULL) { + MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height, depth); + } else { + MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height, depth); } } @@ -2244,12 +2642,12 @@ int ARGBAdd(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) +#if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_SSE2; } #endif -#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) +#if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { @@ -3527,6 +3925,76 @@ int ARGBShuffle(const uint8_t* src_bgra, return 0; } +// Shuffle AR64 channel order. e.g. AR64 to AB64. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, + const uint8_t* shuffler, int width) = AR64ShuffleRow_C; + if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. + if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_ar64 = 0; + } + // Assembly versions can be reused if it's implemented with shuffle. 
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + AR64ShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + AR64ShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + AR64ShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + AR64ShuffleRow = ARGBShuffleRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + AR64ShuffleRow = ARGBShuffleRow_MMI; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, + width * 2); + src_ar64 += src_stride_ar64; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + // Gauss blur a float plane using Gaussian 5x5 filter with // coefficients of 1, 4, 6, 4, 1. // Each destination pixel is a blur of the 5x5 diff --git a/third_party/libyuv/source/rotate_gcc.cc b/third_party/libyuv/source/rotate_gcc.cc index fd359d4ae6..1a3f8cbbda 100644 --- a/third_party/libyuv/source/rotate_gcc.cc +++ b/third_party/libyuv/source/rotate_gcc.cc @@ -17,8 +17,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. 
#if defined(HAS_TRANSPOSEWX8_SSSE3) diff --git a/third_party/libyuv/source/rotate_win.cc b/third_party/libyuv/source/rotate_win.cc index e887dd525c..a78873f843 100644 --- a/third_party/libyuv/source/rotate_win.cc +++ b/third_party/libyuv/source/rotate_win.cc @@ -16,8 +16,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc index 08ae1d2af7..c9a402eda2 100644 --- a/third_party/libyuv/source/row_any.cc +++ b/third_party/libyuv/source/row_any.cc @@ -61,6 +61,8 @@ ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15) ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) #endif +// Note that odd width replication includes 444 due to implementation +// on arm that subsamples 444 to 422 internally. 
// Any 4 planes to 1 with yuvconstants #define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -77,6 +79,10 @@ ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ memcpy(temp + 192, a_buf + n, r); \ + if (width & 1) { \ + temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + } \ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ yuvconstants, MASK + 1); \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ @@ -115,6 +121,124 @@ ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7) #endif #undef ANY41C +// Any 4 planes to 1 plane of 8 bit with yuvconstants +#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +ANY41CT(I210AlphaToARGBRow_Any_SSSE3, + I210AlphaToARGBRow_SSSE3, + 1, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I210ALPHATOARGBROW_AVX2 +ANY41CT(I210AlphaToARGBRow_Any_AVX2, + 
I210AlphaToARGBRow_AVX2, + 1, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +ANY41CT(I410AlphaToARGBRow_Any_SSSE3, + I410AlphaToARGBRow_SSSE3, + 0, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_AVX2 +ANY41CT(I410AlphaToARGBRow_Any_AVX2, + I410AlphaToARGBRow_AVX2, + 0, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#undef ANY41CT + +// Any 4 planes to 1 plane with parameter +#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 4]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEAR64ROW_AVX2 +ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEAR64ROW_NEON +ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +ANY41PT(MergeARGB16To8Row_Any_AVX2, + MergeARGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_NEON +ANY41PT(MergeARGB16To8Row_Any_NEON, + MergeARGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY41PT + // Any 3 planes to 1. 
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -144,13 +268,13 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) #ifdef HAS_MERGERGBROW_MMI ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) #endif -#ifdef HAS_MERGEARGBROW_SSE2 +#ifdef HAS_MERGEXRGBROW_SSE2 ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) #endif -#ifdef HAS_MERGEARGBROW_AVX2 +#ifdef HAS_MERGEXRGBROW_AVX2 ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_MERGEARGBROW_NEON +#ifdef HAS_MERGEXRGBROW_NEON ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) #endif #ifdef HAS_I422TOYUY2ROW_SSE2 @@ -327,11 +451,99 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #ifdef HAS_I210TOAR30ROW_AVX2 ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif +#ifdef HAS_I410TOAR30ROW_SSSE3 +ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_SSSE3 +ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_AVX2 +ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I410TOAR30ROW_AVX2 +ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif #ifdef HAS_I210TOARGBROW_MMI ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7) #endif +#ifdef HAS_I212TOAR30ROW_SSSE3 +ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_SSSE3 +ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_AVX2 +ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I212TOAR30ROW_AVX2 +ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif #undef 
ANY31CT +// Any 3 planes to 1 plane with parameter +#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 3]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEXR30ROW_AVX2 +ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +#endif + +#ifdef HAS_MERGEXR30ROW_NEON +ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) +ANY31PT(MergeXR30Row_10_Any_NEON, + MergeXR30Row_10_NEON, + uint16_t, + 2, + uint8_t, + 4, + 3) +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEXR64ROW_NEON +ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +ANY31PT(MergeXRGB16To8Row_Any_AVX2, + MergeXRGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_NEON +ANY31PT(MergeXRGB16To8Row_Any_NEON, + MergeXRGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY31PT + // Any 2 planes to 1. 
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ @@ -546,12 +758,57 @@ ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7) #endif #undef ANY21C +// Any 2 planes of 16 bit to 1 with yuvconstants +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_P210TOAR30ROW_SSSE3 +ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_SSSE3 +ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_AVX2 +ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P210TOAR30ROW_AVX2 +ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_SSSE3 +ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_SSSE3 +ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_AVX2 +ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_AVX2 +ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif + +#undef ANY21CT + 
// Any 2 16 bit planes with parameter to 1 #define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ int width) { \ SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ + memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ @@ -1100,6 +1357,72 @@ ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) #undef ANY11P #undef ANY11P +// Any 1 to 1 with type +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ + SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ + memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ + ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ + memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_ARGBTOAB64ROW_SSSE3 +ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_AR64TOARGBROW_SSSE3 +ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 
+ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_NEON +ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_NEON +ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#undef ANY11T + // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ @@ -1266,38 +1589,38 @@ ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ 
+ } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_INTERPOLATEROW_AVX2 -ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_NEON -ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_MSA -ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) +ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_MMI -ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) +ANY11I(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) #endif -#undef ANY11T +#undef ANY11I // Any 1 to 1 mirror. #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ @@ -1508,16 +1831,16 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) #ifdef HAS_SPLITRGBROW_MMI ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) #endif -#ifdef HAS_SPLITARGBROW_SSE2 +#ifdef HAS_SPLITXRGBROW_SSE2 ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) #endif -#ifdef HAS_SPLITARGBROW_SSSE3 +#ifdef HAS_SPLITXRGBROW_SSSE3 ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) #endif -#ifdef HAS_SPLITARGBROW_AVX2 +#ifdef HAS_SPLITXRGBROW_AVX2 ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) #endif -#ifdef HAS_SPLITARGBROW_NEON +#ifdef HAS_SPLITXRGBROW_NEON ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) #endif @@ -1557,17 +1880,17 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. 
// 128 byte row allows for 32 avx ARGB pixels. #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 4]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ @@ -1714,17 +2037,17 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) // Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. // 128 byte row allows for 32 avx ARGB pixels. 
#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 3]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \ + ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc index a941c3f5fc..c6e412414e 100644 --- a/third_party/libyuv/source/row_common.cc +++ b/third_party/libyuv/source/row_common.cc @@ -10,6 +10,7 @@ #include "libyuv/row.h" +#include <assert.h> #include <stdio.h> #include <string.h> // For memcpy and memset. @@ -21,10 +22,14 @@ namespace libyuv { extern "C" { #endif -// The following ifdef from row_win makes the C code match the row_win code, -// which is 7 bit fixed point. 
+// This macro control YUV to RGB using unsigned math to extend range of +// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: +// LIBYUV_UNLIMITED_DATA + +// The following macro from row_win makes the C code match the row_win code, +// which is 7 bit fixed point for ARGBToI420: #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) + !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) #define LIBYUV_RGB7 1 #endif @@ -50,6 +55,11 @@ static __inline int32_t clamp1023(int32_t v) { return (-(v >= 1023) | v) & 1023; } +// clamp to max +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (-(v >= max) | v) & max; +} + static __inline uint32_t Abs(int32_t v) { int m = -(v < 0); return (v + m) ^ m; @@ -67,6 +77,10 @@ static __inline int32_t clamp1023(int32_t v) { return (v > 1023) ? 1023 : v; } +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (v > max) ? max : v; +} + static __inline uint32_t Abs(int32_t v) { return (v < 0) ? 
-v : v; } @@ -413,6 +427,82 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { } } +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ar64[0] = src_argb[0] * 0x0101; + dst_ar64[1] = src_argb[1] * 0x0101; + dst_ar64[2] = src_argb[2] * 0x0101; + dst_ar64[3] = src_argb[3] * 0x0101; + dst_ar64 += 4; + src_argb += 4; + } +} + +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ab64[0] = src_argb[2] * 0x0101; + dst_ab64[1] = src_argb[1] * 0x0101; + dst_ab64[2] = src_argb[0] * 0x0101; + dst_ab64[3] = src_argb[3] * 0x0101; + dst_ab64 += 4; + src_argb += 4; + } +} + +void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ar64[0] >> 8; + dst_argb[1] = src_ar64[1] >> 8; + dst_argb[2] = src_ar64[2] >> 8; + dst_argb[3] = src_ar64[3] >> 8; + dst_argb += 4; + src_ar64 += 4; + } +} + +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ab64[2] >> 8; + dst_argb[1] = src_ab64[1] >> 8; + dst_argb[2] = src_ab64[0] >> 8; + dst_argb[3] = src_ab64[3] >> 8; + dst_argb += 4; + src_ab64 += 4; + } +} + +// TODO(fbarchard): Make shuffle compatible with SIMD versions +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width) { + const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; + uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64; + int index0 = shuffler[0] / 2; + int index1 = shuffler[2] / 2; + int index2 = shuffler[4] / 2; + int index3 = shuffler[6] / 2; + // Shuffle a row of AR64. + int x; + for (x = 0; x < width / 2; ++x) { + // To support in-place conversion. 
+ uint16_t b = src_ar64_16[index0]; + uint16_t g = src_ar64_16[index1]; + uint16_t r = src_ar64_16[index2]; + uint16_t a = src_ar64_16[index3]; + dst_ar64_16[0] = b; + dst_ar64_16[1] = g; + dst_ar64_16[2] = r; + dst_ar64_16[3] = a; + src_ar64_16 += 4; + dst_ar64_16 += 4; + } +} + #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { @@ -462,80 +552,80 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = 
RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - 
src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = src_rgb0[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb0[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb0[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - } \ +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ } #endif @@ -603,80 +693,80 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { // ARGBToYJ_C and ARGBToUVJ_C // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - 
} \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = 
AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - } \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + 
src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ } #endif @@ -1146,16 +1236,16 @@ void ARGBShadeRow_C(const uint8_t* src_argb, #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8_t* src_argb0, +void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb0[0]); - const uint32_t g = REPEAT8(src_argb0[1]); - const uint32_t r = REPEAT8(src_argb0[2]); - const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); const uint32_t b_scale = src_argb1[0]; const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; @@ -1164,7 +1254,7 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); dst_argb[3] = SHADE(a, a_scale); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1174,16 +1264,16 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8_t* src_argb0, +void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i 
< width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_add = src_argb1[0]; const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; @@ -1192,7 +1282,7 @@ void ARGBAddRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_add); dst_argb[2] = SHADE(r, r_add); dst_argb[3] = SHADE(a, a_add); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1201,16 +1291,16 @@ void ARGBAddRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8_t* src_argb0, +void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_sub = src_argb1[0]; const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; @@ -1219,7 +1309,7 @@ void ARGBSubtractRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_sub); dst_argb[2] = SHADE(r, r_sub); dst_argb[3] = SHADE(a, a_sub); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1329,64 +1419,36 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // Macros to create SIMD specific yuv to rgb conversion constants. 
-#if defined(__aarch64__) -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \ - {UB, VR, UB, VR, UB, VR, UB, VR}, {UB, VR, UB, VR, UB, VR, UB, VR}, \ - {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, \ - {BB, BG, BR, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \ - {VR, UB, VR, UB, VR, UB, VR, UB}, {VR, UB, VR, UB, VR, UB, VR, UB}, \ - {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, \ - {BR, BG, BB, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; - -#elif defined(__arm__) -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \ - {UB, UB, UB, UB, VR, VR, VR, VR, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {BB, BG, BR, YB, 0, 0, 0, 0}, \ - {0x0101 * YG, YG, 0, 0}}; \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \ - {VR, VR, VR, VR, UB, UB, UB, UB, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {BR, BG, BB, YB, 0, 0, 0, 0}, \ - {0x0101 * YG, YG, 0, 0}}; +// clang-format off +#if defined(__aarch64__) || defined(__arm__) +// Bias values include subtract 128 from U and V, bias from Y and rounding. +// For B and R bias is negative. For G bias is positive. 
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ + 0, 0}} #else -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \ - {-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, \ - -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \ - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ - {0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, \ - 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \ - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \ - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \ - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \ - {-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, \ - -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0}, \ - {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \ - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \ - {0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, \ - 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB}, \ - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \ - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \ - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + 
{{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ + {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} #endif -// TODO(fbarchard): Generate SIMD structures from float matrix. +// clang-format on -// Bias values to round, and subtract 128 from U and V. -#define BB (-UB * 128 + YB) -#define BG (UG * 128 + VG * 128 + YB) -#define BR (-VR * 128 + YB) +#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ + const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ + const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); + +// TODO(fbarchard): Generate SIMD structures from float matrix. // BT.601 limited range YUV to RGB reference // R = (Y - 16) * 1.164 + V * 1.596 @@ -1395,7 +1457,11 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // KR = 0.299; KB = 0.114 // U and V contributions to R,G,B. 
+#ifdef LIBYUV_UNLIMITED_DATA +#define UB 129 /* round(2.018 * 64) */ +#else #define UB 128 /* max(128, round(2.018 * 64)) */ +#endif #define UG 25 /* round(0.391 * 64) */ #define VG 52 /* round(0.813 * 64) */ #define VR 102 /* round(1.596 * 64) */ @@ -1404,7 +1470,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1429,7 +1495,7 @@ MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ #define YB 32 /* 64 / 2 */ -MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1444,9 +1510,12 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) // B = (Y - 16) * 1.164 + U * 2.112 // KR = 0.2126, KB = 0.0722 -// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. 
+#ifdef LIBYUV_UNLIMITED_DATA +#define UB 135 /* round(2.112 * 64) */ +#else #define UB 128 /* max(128, round(2.112 * 64)) */ +#endif #define UG 14 /* round(0.213 * 64) */ #define VG 34 /* round(0.533 * 64) */ #define VR 115 /* round(1.793 * 64) */ @@ -1455,7 +1524,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1480,7 +1549,7 @@ MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ #define YB 32 /* 64 / 2 */ -MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1495,9 +1564,12 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) // B = (Y - 16) * 1.164384 + U * 2.14177 // KR = 0.2627; KB = 0.0593 -// TODO(fbarchard): Improve accuracy; the B channel is off by 7%. // U and V contributions to R,G,B. 
+#ifdef LIBYUV_UNLIMITED_DATA +#define UB 137 /* round(2.142 * 64) */ +#else #define UB 128 /* max(128, round(2.142 * 64)) */ +#endif #define UG 12 /* round(0.187326 * 64) */ #define VG 42 /* round(0.65042 * 64) */ #define VR 107 /* round(1.67867 * 64) */ @@ -1506,7 +1578,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ #define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ -MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1530,7 +1602,7 @@ MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ #define YB 32 /* 64 / 2 */ -MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1545,6 +1617,42 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) #undef MAKEYUVCONSTANTS +#if defined(__aarch64__) || defined(__arm__) +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVCoeff[0]; \ + int vr = yuvconstants->kUVCoeff[1]; \ + int ug = yuvconstants->kUVCoeff[2]; \ + int vg = yuvconstants->kUVCoeff[3]; \ + int yg = yuvconstants->kRGBCoeffBias[0]; \ + int bb = yuvconstants->kRGBCoeffBias[1]; \ + int bg = yuvconstants->kRGBCoeffBias[2]; \ + int br = yuvconstants->kRGBCoeffBias[3] + +#define CALC_RGB16 \ + int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ + int b16 = y1 + (u * ub) - bb; \ + int g16 = y1 + bg - (u * ug + v * vg); \ + int r16 = y1 + (v * vr) - br +#else +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVToB[0]; \ + int ug = yuvconstants->kUVToG[0]; \ + int vg = yuvconstants->kUVToG[1]; \ + int vr = yuvconstants->kUVToR[1]; \ + int yg = yuvconstants->kYToRgb[0]; \ + int yb = yuvconstants->kYBiasToRgb[0] + +#define CALC_RGB16 \ + int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ + int8_t ui = u; \ + int8_t vi = v; \ + ui -= 0x80; \ + vi -= 0x80; \ + int b16 = y1 + 
(ui * ub); \ + int g16 = y1 - (ui * ug + vi * vg); \ + int r16 = y1 + (vi * vr) +#endif + // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 16 bit. static __inline void YuvPixel(uint8_t y, @@ -1554,39 +1662,12 @@ static __inline void YuvPixel(uint8_t y, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6); - *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6); - *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6); + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = Clamp((int32_t)(b16) >> 6); + *g = Clamp((int32_t)(g16) >> 6); + *r = Clamp((int32_t)(r16) >> 6); } // Reads 8 bit YUV and leaves result as 16 bit. 
@@ -1597,85 +1678,50 @@ static __inline void YuvPixel8_16(uint8_t y, int* g, int* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; } // C reference code that mimics the YUV 16 bit assembly. // Reads 10 bit YUV and leaves result as 16 bit. 
-static __inline void YuvPixel16(int16_t y, - int16_t u, - int16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; +static __inline void YuvPixel10_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y << 6; u = clamp255(u >> 2); v = clamp255(v >> 2); - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 12 bit YUV and leaves result as 16 bit. 
+static __inline void YuvPixel12_16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y << 4; + u = clamp255(u >> 4); + v = clamp255(v >> 4); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; } // C reference code that mimics the YUV 10 bit assembly. @@ -1690,22 +1736,78 @@ static __inline void YuvPixel10(uint16_t y, int b16; int g16; int r16; - YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); +} + +// C reference code that mimics the YUV 12 bit assembly. +// Reads 12 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel12(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); *b = Clamp(b16 >> 6); *g = Clamp(g16 >> 6); *r = Clamp(r16 >> 6); } +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 8 bit. +static __inline void YuvPixel16_8(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + CALC_RGB16; + *b = Clamp((int32_t)(b16) >> 6); + *g = Clamp((int32_t)(g16) >> 6); + *r = Clamp((int32_t)(r16) >> 6); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 16 bit. 
+static __inline void YuvPixel16_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + // C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 16 bit. +// Reads 8 bit YUV and leaves result as 8 bit. static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) || defined(__arm__) - int ygb = yuvconstants->kUVBiasBGR[3]; - int yg = yuvconstants->kYToRgb[1]; + int yg = yuvconstants->kRGBCoeffBias[0]; + int ygb = yuvconstants->kRGBCoeffBias[4]; #else int ygb = yuvconstants->kYBiasToRgb[0]; int yg = yuvconstants->kYToRgb[0]; @@ -1716,38 +1818,6 @@ static __inline void YPixel(uint8_t y, *r = Clamp(((int32_t)(y1) + ygb) >> 6); } -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) -// C mimic assembly. -// TODO(fbarchard): Remove subsampling from Neon. -void I444ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; - uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 2; - src_v += 2; - rgb_buf += 8; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} -#else void I444ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1765,7 +1835,6 @@ void I444ToARGBRow_C(const uint8_t* src_y, rgb_buf += 4; // Advance 1 pixel. } } -#endif // Also used for 420 void I422ToARGBRow_C(const uint8_t* src_y, @@ -1821,9 +1890,102 @@ void I210ToARGBRow_C(const uint16_t* src_y, } } +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = clamp255(src_a[1] >> 2); + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + } +} + +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + src_y += 1; + src_u += 1; + src_v += 1; + src_a += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +// 12 bit YUV to ARGB +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { uint32_t ar30; - b = b >> 4; // convert 10.6 to 10 bit. + b = b >> 4; // convert 8 bit 10.6 to 10 bit. 
g = g >> 4; r = r >> 4; b = Clamp10(b); @@ -1845,9 +2007,9 @@ void I210ToAR30Row_C(const uint16_t* src_y, int g; int r; for (x = 0; x < width - 1; x += 2) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); - YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; @@ -1855,16 +2017,15 @@ void I210ToAR30Row_C(const uint16_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } -// 8 bit YUV to 10 bit AR30 -// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. -void I422ToAR30Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, +// 12 bit YUV to 10 bit AR30 +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { @@ -1873,9 +2034,9 @@ void I422ToAR30Row_C(const uint8_t* src_y, int g; int r; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); - YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; @@ -1883,45 +2044,142 @@ void I422ToAR30Row_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) -// C mimic assembly. -// TODO(fbarchard): Remove subsampling from Neon. -void I444AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +// P210 has 10 bits in msb of 16 bit NV12 style layout. +void P210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; - uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); - rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); - rgb_buf[7] = src_a[1]; + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5, + dst_argb + 6, yuvconstants); + dst_argb[7] = 255; src_y += 2; - src_u += 2; - src_v += 2; - src_a += 2; + src_uv += 2; + dst_argb += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + } +} + +void P410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + src_y += 1; + src_uv += 2; + dst_argb += 4; // Advance 1 pixels. + } +} + +void P210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30 + 4, b, g, r); + src_y += 2; + src_uv += 2; + dst_ar30 += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + } +} + +void P410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + src_y += 1; + src_uv += 2; + dst_ar30 += 4; // Advance 1 pixel. + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. 
+void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); } } -#else + void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1941,7 +2199,6 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, rgb_buf += 4; // Advance 1 pixel. } } -#endif void I422AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, @@ -2492,6 +2749,105 @@ void MergeARGBRow_C(const uint8_t* src_r, } } +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + assert(depth >= 10); + assert(depth <= 16); + int x; + int shift = depth - 10; + uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; + for (x = 0; x < width; ++x) { + uint32_t r = clamp1023(src_r[x] >> shift); + uint32_t g = clamp1023(src_g[x] >> shift); + uint32_t b = clamp1023(src_b[x] >> shift); + dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; + } +} + +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = ClampMax(src_b[x], max) << 
shift; + dst_ar64[1] = ClampMax(src_g[x], max) << shift; + dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[3] = ClampMax(src_a[x], max) << shift; + dst_ar64 += 4; + } +} + +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = clamp255(src_a[x] >> shift); + dst_argb += 4; + } +} + +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = ClampMax(src_b[x], max) << shift; + dst_ar64[1] = ClampMax(src_g[x], max) << shift; + dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[3] = 0xffff; + dst_ar64 += 4; + } +} + +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = 0xff; + dst_argb += 4; + } +} + void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -2528,6 +2884,8 @@ void MergeUVRow_16_C(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; + assert(depth >= 8); + assert(depth <= 16); int x; for (x = 0; x < width; ++x) { dst_uv[0] = src_u[x] << shift; @@ -2544,6 +2902,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv, int width) { int shift = 16 
- depth; int x; + assert(depth >= 8); + assert(depth <= 16); for (x = 0; x < width; ++x) { dst_u[x] = src_uv[0] >> shift; dst_v[x] = src_uv[1] >> shift; @@ -2581,6 +2941,9 @@ void Convert16To8Row_C(const uint16_t* src_y, int scale, int width) { int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < width; ++x) { dst_y[x] = clamp255((src_y[x] * scale) >> 16); } @@ -2714,19 +3077,19 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { #define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8_t* src_argb0, +void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32_t fb = src_argb0[0]; - uint32_t fg = src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; @@ -2735,10 +3098,10 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[2] = BLEND(fr, br, a); dst_argb[3] = 255u; - fb = src_argb0[4 + 0]; - fg = src_argb0[4 + 1]; - fr = src_argb0[4 + 2]; - a = src_argb0[4 + 3]; + fb = src_argb[4 + 0]; + fg = src_argb[4 + 1]; + fr = src_argb[4 + 2]; + a = src_argb[4 + 3]; bb = src_argb1[4 + 0]; bg = src_argb1[4 + 1]; br = src_argb1[4 + 2]; @@ -2746,16 +3109,16 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[4 + 1] = BLEND(fg, bg, a); dst_argb[4 + 2] = BLEND(fr, br, a); dst_argb[4 + 3] = 255u; - src_argb0 += 8; + src_argb += 8; src_argb1 += 8; dst_argb += 8; } if (width & 1) { - uint32_t fb = src_argb0[0]; - uint32_t fg = 
src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; @@ -3280,7 +3643,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 -#if !(defined(_MSC_VER) && defined(_M_IX86)) && \ +#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8_t* src_y, @@ -3747,13 +4110,14 @@ void NV21ToYUV24Row_C(const uint8_t* src_y, } // Filter 2 rows of AYUV UV's (444) into UV (420). +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_C(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { // Output a row of UV values, filtering 2x2 rows of AYUV. int x; - for (x = 0; x < width; x += 2) { + for (x = 0; x < width - 1; x += 2) { dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2; @@ -3764,12 +4128,8 @@ void AYUVToUVRow_C(const uint8_t* src_ayuv, dst_uv += 2; } if (width & 1) { - dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 0] + 2) >> - 2; - dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 1] + 2) >> - 2; + dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; + dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; } } @@ -3780,7 +4140,7 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, int width) { // Output a row of VU values, filtering 2x2 rows of AYUV. 
int x; - for (x = 0; x < width; x += 2) { + for (x = 0; x < width - 1; x += 2) { dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2; @@ -3791,12 +4151,8 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, dst_vu += 2; } if (width & 1) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 0] + 2) >> - 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 1] + 2) >> - 2; + dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; + dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; } } diff --git a/third_party/libyuv/source/row_gcc.cc b/third_party/libyuv/source/row_gcc.cc index faf0fc9104..001c353dbe 100644 --- a/third_party/libyuv/source/row_gcc.cc +++ b/third_party/libyuv/source/row_gcc.cc @@ -16,8 +16,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -1078,6 +1077,222 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, + 6, 6, 5, 5, 4, 4, 7, 7}; +static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, + 14, 14, 13, 13, 12, 12, 15, 15}; + +void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + 
"jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" + "movdqa %4,%%xmm3 \n" LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pshufb %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +void ARGBToAR64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +void ARGBToAB64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" + "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm1 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_AB64TOARGBROW_AVX2 +void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + // clang-format off // TODO(mraptis): Consider passing R, G, B multipliers as parameter. @@ -1290,7 +1505,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { #endif // HAS_RGBATOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1342,7 +1557,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1359,7 +1574,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1407,7 +1622,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1422,7 +1637,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ABGRTOUVROW_AVX2 -void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, @@ -1470,7 +1685,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_abgr0), // %0 + : "+r"(src_abgr), // %0 
"+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1485,7 +1700,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, #endif // HAS_ABGRTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1534,7 +1749,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1549,7 +1764,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1602,7 +1817,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1689,7 +1904,7 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "xmm7"); } -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, @@ -1741,7 +1956,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_bgra0), // %0 + : "+r"(src_bgra), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1786,7 +2001,7 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "xmm7"); } -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, @@ -1838,7 +2053,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_abgr0), // %0 + : 
"+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1849,7 +2064,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, @@ -1901,7 +2116,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_rgba0), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1916,21 +2131,21 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 8 UV from 444 #define READYUV444 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ - "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf]),%%xmm3 \n" \ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" @@ -1940,24 +2155,87 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // TODO(fbarchard): Consider pmulhuw to replace psraw // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. 
#define READYUV210 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm0 \n" \ - "psraw $0x2,%%xmm0 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +#define READYUVA210 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 444 10 bit +#define READYUV410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 444 10 bit. With 8 Alpha. 
+#define READYUVA410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 4 UV from 422 12 bit, upsample to 8 UV +#define READYUV212 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $0x4,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x4,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ - "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf]),%%xmm3 \n" \ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" \ @@ -1966,10 +2244,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 8 UV from 444. With 8 Alpha. 
#define READYUVA444 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" \ @@ -1978,18 +2256,18 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 4 UV from NV12, upsample to 8 UV #define READNV12 \ - "movq (%[uv_buf]),%%xmm0 \n" \ + "movq (%[uv_buf]),%%xmm3 \n" \ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV #define READNV21 \ - "movq (%[vu_buf]),%%xmm0 \n" \ + "movq (%[vu_buf]),%%xmm3 \n" \ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "pshufb %[kShuffleNV21], %%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" @@ -1998,68 +2276,92 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, #define READYUY2 \ "movdqu (%[yuy2_buf]),%%xmm4 \n" \ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu (%[yuy2_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "movdqu (%[yuy2_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
#define READUYVY \ "movdqu (%[uyvy_buf]),%%xmm4 \n" \ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu (%[uyvy_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "movdqu (%[uyvy_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" +// Read 4 UV from P210, upsample to 8 UV +#define READP210 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "psrlw $0x8,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + #if defined(__x86_64__) #define YUVTORGB_SETUP(yuvconstants) \ + "pcmpeqb %%xmm13,%%xmm13 \n" \ "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "pxor %%xmm12,%%xmm12 \n" \ "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "psllw $7,%%xmm13 \n" \ "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "pshufb %%xmm12,%%xmm13 \n" \ "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ - "movdqa 192(%[yuvconstants]),%%xmm14 \n" + "movdqa 128(%[yuvconstants]),%%xmm12 \n" + // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa %%xmm11,%%xmm0 \n" \ - "pmaddubsw %%xmm8,%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm12,%%xmm1 \n" \ - "pmaddubsw %%xmm9,%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa %%xmm13,%%xmm2 \n" \ - "pmaddubsw %%xmm10,%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw %%xmm14,%%xmm4 \n" \ + "psubb %%xmm13,%%xmm3 \n" \ + "pmulhuw %%xmm11,%%xmm4 \n" \ + "movdqa %%xmm8,%%xmm0 \n" \ + 
"movdqa %%xmm9,%%xmm1 \n" \ + "movdqa %%xmm10,%%xmm2 \n" \ + "paddw %%xmm12,%%xmm4 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" -#define YUVTORGB_REGS \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + +#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ - "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ - "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ - "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "pcmpeqb %%xmm0,%%xmm0 \n" \ + "pxor %%xmm1,%%xmm1 \n" \ + "psllw $7,%%xmm0 \n" \ + "pshufb %%xmm1,%%xmm0 \n" \ + "psubb %%xmm0,%%xmm3 \n" \ + "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \ + "movdqa (%[yuvconstants]),%%xmm0 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm1 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm3 \n" \ + "paddw %%xmm3,%%xmm4 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + #define YUVTORGB_REGS #endif @@ -2275,8 +2577,8 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants "psrlw $14,%%xmm5 \n" "psllw $4,%%xmm5 \n" // 2 alpha bits 
- "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN @@ -2327,6 +2629,36 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, ); } +// 12 bit YUV to ARGB +void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + // 10 bit YUV to AR30 void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, @@ -2340,8 +2672,8 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $14,%%xmm5 \n" "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN @@ -2362,6 +2694,176 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, ); } +// 12 bit YUV to AR30 +void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" 
+ "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA210 + YUVTORGB(yuvconstants) STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : 
[yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA410 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); + // clang-format on +} +#endif + +// 10 bit YUV to AR30 +void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm4", "xmm5", "xmm6", "xmm7" + ); +} + #ifdef HAS_I422ALPHATOARGBROW_SSSE3 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, @@ -2513,6 +3015,112 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, // clang-format on } +void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP210 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP410 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP210 + 
YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2546,12 +3154,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 16 UV from 444 #define READYUV444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2559,42 +3167,108 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from 422, 
upsample to 16 UV. #define READYUV422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq (%[u_buf]),%%xmm3 \n" \ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" -// Read 8 UV from 210 10 bit, upsample to 16 UV +// Read 8 UV from 210, upsample to 16 UV // TODO(fbarchard): Consider vshufb to replace pack/unpack // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. #define READYUV210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. 
+#define READYUVA210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + +// Read 16 UV from 410 +#define READYUV410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 212 12 bit, upsample to 16 UV +#define READYUV212_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $0x4,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" +// Read 16 UV from 410. With 16 Alpha. 
+#define READYUVA410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + // Read 16 UV from 444. With 16 Alpha. #define READYUVA444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2605,12 +3279,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. #define READYUVA422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq (%[u_buf]),%%xmm3 \n" \ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2621,10 +3295,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from NV12, upsample to 16 UV. 
#define READNV12_AVX2 \ - "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "vmovdqu (%[uv_buf]),%%xmm3 \n" \ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2632,73 +3306,98 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 VU from NV21, upsample to 16 UV. #define READNV21_AVX2 \ - "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "vmovdqu (%[vu_buf]),%%xmm3 \n" \ "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" +// Read 4 UV from P210, upsample to 8 UV +#define READP210_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm3 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm3 \n" \ + "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \ + "lea 0x40(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. 
#define READYUY2_AVX2 \ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \ "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. #define READUYVY_AVX2 \ "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ - "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ - "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ - "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vpsllw $7,%%xmm13,%%xmm13 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vpbroadcastb %%xmm13,%%ymm13 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" #define YUVTORGB16_AVX2(yuvconstants) \ - "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ - "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ - "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ - "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ - "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ + "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \ + "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \ + "vpaddw 
%%ymm4,%%ymm12,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" -#define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", +#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", #else // Convert 16 pixels: 16 UV and 16 Y. #define YUVTORGB_SETUP_AVX2(yuvconstants) #define YUVTORGB16_AVX2(yuvconstants) \ - "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ - "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ - "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ - "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \ + "vpsllw $7,%%xmm0,%%xmm0 \n" \ + "vpbroadcastb %%xmm0,%%ymm0 \n" \ + "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \ + "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm0 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \ + "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + #define YUVTORGB_REGS_AVX2 #endif @@ -2721,7 +3420,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ "vmovdqu %%ymm1,(%[dst_argb]) \n" \ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ - "lea 0x40(%[dst_argb]), %[dst_argb] \n" + "lea 0x40(%[dst_argb]), %[dst_argb] \n" // Store 16 AR30 values. 
#define STOREAR30_AVX2 \ @@ -2894,6 +3593,41 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, } #endif // HAS_I210TOARGBROW_AVX2 +#if defined(HAS_I212TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I212TOARGBROW_AVX2 + #if defined(HAS_I210TOAR30ROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). @@ -2929,11 +3663,198 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } #endif // HAS_I210TOAR30ROW_AVX2 +#if defined(HAS_I212TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I212TOAR30ROW_AVX2 + +#if defined(HAS_I410TOARGBROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
+void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I210ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). +void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA210_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I410ALPHATOARGBROW_AVX2) +// 16 pixels +// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). 
+void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA410_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I410TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I410TOAR30ROW_AVX2 + #if defined(HAS_I444ALPHATOARGBROW_AVX2) // 16 pixels // 16 UV values with 16 Y and 16 A producing 16 ARGB. @@ -3193,14 +4114,154 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, } #endif // HAS_UYVYTOARGBROW_AVX2 +#if defined(HAS_P210TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READP210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_P210TOARGBROW_AVX2 + +#if defined(HAS_P410TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READP410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_P410TOARGBROW_AVX2 + +#if defined(HAS_P210TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READP210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_P210TOAR30ROW_AVX2 + +#if defined(HAS_P410TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READP410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_P410TOAR30ROW_AVX2 + #ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164 - "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 + "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 + "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 "pslld $0x18,%%xmm4 \n" @@ -3244,8 +4305,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164 - "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 + "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 "vpslld $0x18,%%ymm4,%%ymm4 \n" @@ -3663,8 +4724,6 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" - "vpunpcklwd 
%%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%xmm3 \n" "sub %0,%1 \n" // 16 pixels per loop. @@ -3696,7 +4755,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, } #endif // HAS_MERGEUVROW_AVX2 -#ifdef HAS_MERGEUVROW_16_AVX2 +#ifdef HAS_SPLITUVROW_16_AVX2 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; void SplitUVRow_16_AVX2(const uint16_t* src_uv, @@ -3707,44 +4766,41 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, depth = 16 - depth; // clang-format off asm volatile ( - "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%xmm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - "sub %1,%2 \n" + "vmovd %4,%%xmm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "sub %1,%2 \n" // 16 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - - "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x0,%%ymm1,0x10(%1) \n" - "vextractf128 $0x1,%%ymm0,(%1,%2) \n" - "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" - "add $0x20,%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + + "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x0,%%ymm1,0x10(%1) \n" + "vextractf128 $0x1,%%ymm0,(%1,%2) \n" + "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" + "add $0x20,%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width), // %3 - "+r"(depth) // %4 - : + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + 
"+r"(width) // %3 + : "r"(depth), // %4 "m"(kSplitUVShuffle16) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); // clang-format on } -#endif // HAS_MERGEUVROW_AVX2 +#endif // HAS_SPLITUVROW_16_AVX2 // Use scale to convert lsb formats to msb, depending how many bits there are: // 128 = 9 bits @@ -3797,24 +4853,24 @@ void DivideRow_16_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width), // %2 @@ -4202,7 +5258,9 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_MERGEXRGBROW_SSE2 void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -4286,7 +5344,9 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_MERGEXRGBROW_AVX2 void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -4380,7 +5440,9 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_SPLITXRGBROW_SSE2 void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* 
dst_r, uint8_t* dst_g, @@ -4471,12 +5533,14 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } +#endif +#ifdef HAS_SPLITXRGBROW_SSSE3 void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -4562,13 +5626,15 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit), // %6 "m"(kShuffleMaskARGBPermute) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } +#endif +#ifdef HAS_SPLITXRGBROW_AVX2 void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -4610,7 +5676,318 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, "+r"(width) // %4 : "m"(kShuffleMaskARGBSplit), // %5 "m"(kShuffleMaskARGBPermute) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_MERGEXR30ROW_AVX2 +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = depth - 10; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $6,%%ymm6,%%ymm6 \n" + "vmovd %5,%%xmm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1),%%ymm1 \n" + "vmovdqu (%0,%2),%%ymm2 \n" + "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" + "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq 
$0xd8,%%ymm2,%%ymm2 \n" + "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit + "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB + "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" + "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit + "vpslld $0xa,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine + "vpor %%ymm2,%%ymm3,%%ymm3 \n" + "vmovdqu %%ymm0,(%3) \n" + "vmovdqu %%ymm3,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 +#if defined(__i386__) + : "m"(shift) // %5 +#else + : "rm"(shift) // %5 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_MERGEAR64ROW_AVX2 +static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vmovdqa %8,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + "vbroadcastss %7,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpermd %%ymm3,%%ymm5,%%ymm3 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd 
%%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%4) \n" + "vmovdqu %%ymm2,0x20(%4) \n" + "vmovdqu %%ymm4,0x40(%4) \n" + "vmovdqu %%ymm1,0x60(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(mask), // %7 + "m"(MergeAR64Permute) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vmovdqa %7,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vbroadcastss %6,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) 
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%3) \n" + "vmovdqu %%ymm2,0x20(%3) \n" + "vmovdqu %%ymm4,0x40(%3) \n" + "vmovdqu %%ymm1,0x60(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(mask), // %6 + "m"(MergeAR64Permute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vbroadcastf128 %7,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%4) \n" + "vmovdqu %%ymm0,0x20(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + 
"vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(MergeARGB16To8Shuffle) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vbroadcastf128 %6,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%3) \n" + "vmovdqu %%ymm0,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(MergeARGB16To8Shuffle) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif @@ -5339,7 +6716,7 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* 
src_argb0, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5410,7 +6787,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, "sub $0x1,%3 \n" "jge 91b \n" "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6012,7 +7389,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6040,7 +7417,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6051,7 +7428,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6078,7 +7455,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6089,7 +7466,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 
-void ARGBAddRow_SSE2(const uint8_t* src_argb0, +void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6106,7 +7483,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6117,7 +7494,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8_t* src_argb0, +void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6134,7 +7511,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6145,7 +7522,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6162,7 +7539,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6173,7 +7550,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 
-void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6190,7 +7567,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7279,7 +8656,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -7317,7 +8694,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4"); } diff --git a/third_party/libyuv/source/row_mmi.cc b/third_party/libyuv/source/row_mmi.cc index 9a8e2cb2d1..362fd1cfcc 100644 --- a/third_party/libyuv/source/row_mmi.cc +++ b/third_party/libyuv/source/row_mmi.cc @@ -605,7 +605,7 @@ void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -613,8 +613,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -626,8 +626,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 
0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -639,8 +639,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -652,8 +652,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -671,20 +671,20 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), 
[zero] "f"(0x00) : "memory"); } -void ARGBToUVRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -700,9 +700,9 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -720,8 +720,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -748,8 +748,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -767,8 +767,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) 
\n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -795,8 +795,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -814,8 +814,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -842,8 +842,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -861,8 +861,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -901,7 +901,7 
@@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -913,7 +913,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -921,7 +921,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -929,8 +929,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -942,8 +942,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh 
%[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -955,8 +955,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -968,8 +968,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -987,20 +987,20 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void BGRAToUVRow_MMI(const uint8_t* src_rgb0, +void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* 
dst_v, @@ -1016,9 +1016,9 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1036,8 +1036,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1064,8 +1064,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1083,8 +1083,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1111,8 +1111,8 @@ void 
BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1130,8 +1130,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1158,8 +1158,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1177,8 +1177,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1217,7 +1217,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu 
%[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1229,7 +1229,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1237,7 +1237,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1245,8 +1245,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1258,8 +1258,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1271,8 
+1271,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1284,8 +1284,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1303,20 +1303,20 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ABGRToUVRow_MMI(const uint8_t* src_rgb0, +void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1332,9 +1332,9 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], 
%[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1352,8 +1352,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1380,8 +1380,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1399,8 +1399,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1427,8 +1427,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" 
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1446,8 +1446,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1474,8 +1474,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1493,8 +1493,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1533,7 +1533,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu 
%[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1545,7 +1545,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1553,7 +1553,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1561,8 +1561,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1574,8 +1574,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1587,8 +1587,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" 
"psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1600,8 +1600,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1619,20 +1619,20 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGBAToUVRow_MMI(const uint8_t* src_rgb0, +void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1648,9 +1648,9 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) 
\n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1668,8 +1668,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1696,8 +1696,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1715,8 +1715,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1743,8 +1743,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 
0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1762,8 +1762,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1790,8 +1790,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1809,8 +1809,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1849,7 +1849,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1861,7 +1861,7 @@ void 
RGBAToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1869,7 +1869,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1877,8 +1877,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1891,8 +1891,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1905,8 +1905,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - 
"gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1919,8 +1919,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1939,20 +1939,20 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, +void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1968,9 +1968,9 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], 
%[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1990,8 +1990,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2020,8 +2020,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2041,8 +2041,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2071,8 +2071,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 
0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2092,8 +2092,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2122,8 +2122,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2143,8 +2143,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2185,7 +2185,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2197,7 +2197,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), 
[dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -2205,7 +2205,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -2213,8 +2213,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2227,8 +2227,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2241,8 +2241,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 
0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2255,8 +2255,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2275,20 +2275,20 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RAWToUVRow_MMI(const uint8_t* src_rgb0, +void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2304,9 +2304,9 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" 
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2326,8 +2326,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2356,8 +2356,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2377,8 +2377,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2407,8 +2407,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2428,8 +2428,8 
@@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2458,8 +2458,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2479,8 +2479,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2521,7 +2521,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2533,7 +2533,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), 
[src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -2541,7 +2541,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest, dest0, dest1, dest2, dest3; uint64_t tmp0, tmp1; @@ -2618,13 +2618,13 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), [width] "r"(width) : "memory"); } -void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2637,9 +2637,9 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, __asm__ volatile( "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2655,8 +2655,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 
0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2681,8 +2681,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2698,8 +2698,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2724,8 +2724,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2741,8 +2741,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 
0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2767,8 +2767,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2784,8 +2784,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2822,7 +2822,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2833,7 +2833,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), @@ -4386,7 +4386,7 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb, : 
"memory"); } -void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, +void ARGBMultiplyRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4422,12 +4422,12 @@ void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) : "memory"); } -void ARGBAddRow_MMI(const uint8_t* src_argb0, +void ARGBAddRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4449,12 +4449,12 @@ void ARGBAddRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } -void ARGBSubtractRow_MMI(const uint8_t* src_argb0, +void ARGBSubtractRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4476,7 +4476,7 @@ void ARGBSubtractRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } @@ -5552,10 +5552,10 @@ void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { : "memory"); } -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. 
// This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_MMI(const uint8_t* src_argb0, +void ARGBBlendRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5608,7 +5608,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [shift] "f"(shift), [width] "r"(width) diff --git a/third_party/libyuv/source/row_msa.cc b/third_party/libyuv/source/row_msa.cc index fe6df93a60..c0b13b0fd0 100644 --- a/third_party/libyuv/source/row_msa.cc +++ b/third_party/libyuv/source/row_msa.cc @@ -781,7 +781,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, } } -void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -792,10 +792,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = 
(v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); @@ -822,18 +822,18 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVRow_MSA(const uint8_t* src_argb0, +void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + const uint8_t* src_argb_next = src_argb + src_stride_argb; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; @@ -847,14 +847,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -875,14 +875,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, reg3 = 
__msa_hadd_u_h(vec5, vec5); reg4 = __msa_hadd_u_h(vec0, vec0); reg5 = __msa_hadd_u_h(vec1, vec1); - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -945,8 +945,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); - src_argb0 += 128; - src_argb0_next += 128; + src_argb += 128; + src_argb_next += 128; dst_u += 16; dst_v += 16; } @@ -1173,7 +1173,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb, } } -void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, +void ARGBMultiplyRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1184,7 +1184,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, v8i16 zero = {0}; for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = 
(v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); @@ -1206,13 +1206,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_argb); - src_argb0 += 16; + src_argb += 16; src_argb1 += 16; dst_argb += 16; } } -void ARGBAddRow_MSA(const uint8_t* src_argb0, +void ARGBAddRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1220,20 +1220,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } } -void ARGBSubtractRow_MSA(const uint8_t* src_argb0, +void ARGBSubtractRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1241,14 +1241,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_subs_u_b(src0, src2); dst1 = __msa_subs_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } @@ -1794,7 +1794,7 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { } } -void 
RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1809,9 +1809,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1830,12 +1830,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } -void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1850,9 +1850,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ 
-1871,7 +1871,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } @@ -2037,14 +2037,14 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, } } -void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, +void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; @@ -2147,14 +2147,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, } } -void RAWToUVRow_MSA(const uint8_t* src_rgb0, +void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -2446,7 +2446,7 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx, } } -void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); @@ -2454,19 +2454,19 @@ void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = 
(v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); @@ -2474,19 +2474,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); @@ -2494,19 +2494,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 
= (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); @@ -2514,26 +2514,26 @@ void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, +void ARGBToUVJRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; v8u16 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 vec0, vec1, vec2, vec3; v8u16 dst0, dst1, dst2, dst3; @@ -2658,14 +2658,14 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, } } -void BGRAToUVRow_MSA(const uint8_t* src_rgb0, +void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* 
t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -2693,14 +2693,14 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0, } } -void ABGRToUVRow_MSA(const uint8_t* src_rgb0, +void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -2728,14 +2728,14 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0, } } -void RGBAToUVRow_MSA(const uint8_t* src_rgb0, +void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -3109,7 +3109,7 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, } } -void ARGBBlendRow_MSA(const uint8_t* src_argb0, +void ARGBBlendRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -3123,8 +3123,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, v16i8 zero = {0}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); @@ -3168,7 +3168,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, dst0 = __msa_bmnz_v(dst0, const_255, mask); dst1 = __msa_bmnz_v(dst1, const_255, mask); ST_UB2(dst0, dst1, 
dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc index 43a2cac752..6ef6f1c463 100644 --- a/third_party/libyuv/source/row_neon.cc +++ b/third_party/libyuv/source/row_neon.cc @@ -21,90 +21,115 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) +// q0: Y uint16x8_t +// d2: U uint8x8_t +// d3: V uint8x8_t + // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.32 {d2[0]}, [%1]! \n" \ - "vld1.32 {d2[1]}, [%2]! \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.32 {d2[0]}, [%[src_u]]! \n" \ + "vld1.32 {d2[1]}, [%[src_v]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmovl.u8 q1, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 q1, q1, #8 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! \n" \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_u]]! \n" \ + "vmovl.u8 q0, d0 \n" \ + "vld1.8 {d3}, [%[src_v]]! \n" \ + "vsli.u16 q0, q0, #8 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vmov.u8 q1, #128 \n" \ + "vmovl.u8 q0, d0 \n" \ + "vsli.u16 q0, q0, #8 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READNV12 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_uv]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ + "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ // Read 8 Y and 4 VU from NV21 #define READNV21 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! 
\n" \ - "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_vu]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ + "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ // Read 8 YUY2 #define READYUY2 \ - "vld2.8 {d0, d2}, [%0]! \n" \ + "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ + "vmovl.u8 q0, d0 \n" \ "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" // Read 8 UYVY #define READUYVY \ - "vld2.8 {d2, d3}, [%0]! \n" \ - "vmov.u8 d0, d3 \n" \ + "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ + "vmovl.u8 q0, d3 \n" \ "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" - -#define YUVTORGB_SETUP \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ - "vld1.8 {d25}, [%[kUVToG]] \n" \ - "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ - "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! 
\n" \ - "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ - "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" - -#define YUVTORGB \ - "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ - "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ - "vmovl.u8 q0, d0 \n" /* Y */ \ - "vmovl.s16 q10, d1 \n" \ - "vmovl.s16 q0, d0 \n" \ - "vmul.s32 q10, q10, q15 \n" \ - "vmul.s32 q0, q0, q15 \n" \ - "vqshrun.s32 d0, q0, #16 \n" \ - "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ - "vadd.s16 d18, d19 \n" \ - "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ - "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ - "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ - "vaddw.u16 q1, q1, d16 \n" \ - "vaddw.u16 q10, q10, d17 \n" \ - "vaddw.u16 q3, q3, d18 \n" \ - "vqadd.s16 q8, q0, q13 \n" /* B */ \ - "vqadd.s16 q9, q0, q14 \n" /* R */ \ - "vqadd.s16 q0, q0, q4 \n" /* G */ \ - "vqadd.s16 q8, q8, q1 \n" /* B */ \ - "vqadd.s16 q9, q9, q10 \n" /* R */ \ - "vqsub.s16 q0, q0, q3 \n" /* G */ \ - "vqshrun.s16 d20, q8, #6 \n" /* B */ \ - "vqshrun.s16 d22, q9, #6 \n" /* R */ \ - "vqshrun.s16 d21, q0, #6 \n" /* G */ + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" + +#define YUVTORGB_SETUP \ + "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ + "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ + "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ + "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! 
\n" \ + "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" + +// q0: B uint16x8_t +// q1: G uint16x8_t +// q2: R uint16x8_t + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "vmull.u16 q2, d1, d31 \n" \ + "vmull.u8 q8, d3, d29 \n" /* DGV */ \ + "vmull.u16 q0, d0, d31 \n" \ + "vmlal.u8 q8, d2, d28 \n" /* DG */ \ + "vqshrn.u32 d0, q0, #16 \n" \ + "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ + "vmull.u8 q9, d2, d26 \n" /* DB */ \ + "vmull.u8 q2, d3, d27 \n" /* DR */ \ + "vadd.u16 q4, q0, q11 \n" /* G */ \ + "vadd.u16 q2, q0, q2 \n" /* R */ \ + "vadd.u16 q0, q0, q9 \n" /* B */ \ + "vqsub.u16 q1, q4, q8 \n" /* G */ \ + "vqsub.u16 q0, q0, q10 \n" /* B */ \ + "vqsub.u16 q2, q2, q12 \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "vqshrn.u16 d4, q2, #6 \n" /* R */ \ + "vqshrn.u16 d2, q1, #6 \n" /* G */ \ + "vqshrn.u16 d0, q0, #6 \n" /* B */ + +#define YUVTORGB_REGS \ + "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" + +#define STORERGBA \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d4 \n" \ + "vmov.u8 d0, d6 \n" \ + "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -114,22 +139,20 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToARGBRow_NEON(const uint8_t* src_y, @@ -140,22 +163,20 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I444AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -168,22 +189,20 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB - "vld1.8 {d23}, [%3]! \n" - "subs %5, %5, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -196,22 +215,20 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %5, %5, #8 \n" - "vld1.8 {d23}, [%3]! \n" - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGBARow_NEON(const uint8_t* src_y, @@ -222,22 +239,18 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 - "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGB24Row_NEON(const uint8_t* src_y, @@ -248,29 +261,28 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst3.8 {d20, d21, d22}, [%3]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTORGB565 \ - "vshll.u8 q0, d22, #8 \n" /* R */ \ - "vshll.u8 q8, d21, #8 \n" /* G */ \ - "vshll.u8 q9, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #5 \n" /* RG */ \ - "vsri.16 q0, q9, #11 \n" /* RGB */ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q2, q1, #5 \n" /* RG */ \ + "vsri.16 q2, q0, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -280,31 +292,29 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTOARGB1555 \ - "vshll.u8 q0, d23, #8 \n" /* A */ \ - "vshll.u8 q8, d22, #8 \n" /* R */ \ - "vshll.u8 q9, d21, #8 \n" /* G */ \ - "vshll.u8 q10, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #1 \n" /* AR */ \ - "vsri.16 q0, q9, #6 \n" /* ARG */ \ - "vsri.16 q0, q10, #11 \n" /* ARGB */ + "vshll.u8 q3, d6, #8 \n" /* A */ \ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q3, q2, #1 \n" /* AR */ \ + "vsri.16 q3, q1, #6 \n" /* ARG */ \ + "vsri.16 q3, q0, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -315,30 +325,28 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB1555 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 + "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. 
+ "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); } #define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ + "vshr.u8 d0, d0, #4 \n" /* B */ \ + "vbic.32 d2, d2, d7 \n" /* G */ \ + "vshr.u8 d4, d4, #4 \n" /* R */ \ + "vbic.32 d6, d6, d7 \n" /* A */ \ + "vorr d0, d0, d2 \n" /* BG */ \ + "vorr d1, d4, d6 \n" /* RA */ \ "vzip.u8 d0, d1 \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8_t* src_y, @@ -349,25 +357,21 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // vbic bits to clear - "1: \n" - - READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + "vmov.u8 d6, #255 \n" + "vmov.u8 d7, #0x0f \n" // vbic bits to clear + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%[dst_argb4444]]! 
\n" // store 8 pixels + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); } void I400ToARGBRow_NEON(const uint8_t* src_y, @@ -376,20 +380,18 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { @@ -414,22 +416,20 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV21ToARGBRow_NEON(const uint8_t* src_y, @@ -437,22 +437,20 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV12ToRGB24Row_NEON(const uint8_t* src_y, @@ -461,25 +459,19 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - - "1: \n" - - READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void NV21ToRGB24Row_NEON(const uint8_t* src_y, @@ -488,25 +480,19 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - - "1: \n" - - READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void NV12ToRGB565Row_NEON(const uint8_t* src_y, @@ -516,62 +502,56 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. @@ -760,8 +740,8 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q0}, [%2]! 
\n" // load B "subs %4, %4, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,6 +753,226 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, ); } +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "vmov.u32 q14, #1023 \n" + "vdup.32 q15, %5 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vshl.u32 q2, q2, q15 \n" // 000B + "vshl.u32 q1, q1, q15 \n" + "vshl.u32 q0, q0, q15 \n" + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "vmov.u32 q14, #1023 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! 
\n" // R + "vmovl.u16 q2, d4 \n" // 000B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + "3: \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q14"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vdup.u16 q15, %6 \n" + "vdup.u16 q14, %7 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vmin.u16 q3, q3, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "subs %5, %5, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%4]! \n" + "vst4.16 {d1, d3, d5, d7}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vmov.u8 q3, #0xff \n" // A (0xffff) + "vdup.u16 q15, %5 \n" + "vdup.u16 q14, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! 
\n" // B + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "subs %4, %4, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%3]! \n" + "vst4.16 {d1, d3, d5, d7}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "vqmovn.u16 d2, q2 \n" + "vqmovn.u16 d3, q3 \n" + "subs %5, %5, #8 \n" + "vst4.8 {d0, d1, d2, d3}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %5 \n" + "vmov.u8 d6, #0xff \n" // A (0xff) + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vqmovn.u16 d5, q2 \n" + "vqmovn.u16 d4, q1 \n" + "vqmovn.u16 d3, q0 \n" + "subs %4, %4, #8 \n" + "vst4.u8 {d3, d4, d5, d6}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); +} + // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( @@ -1328,16 +1528,16 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "d6"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, @@ -1345,21 +1545,21 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "vdup.32 d2, %2 \n" // dither4 + "vdup.32 d7, %2 \n" // dither4 "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" // add for dither + "vqadd.u8 d0, d0, d7 \n" + "vqadd.u8 d2, d2, d7 \n" + "vqadd.u8 d4, d4, d7 \n" // add for dither ARGBTORGB565 - "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. 
"bgt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, @@ -1367,26 +1567,26 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "vmov.u8 d4, #0x0f \n" // bits to clear with + "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. 
@@ -1395,7 +1595,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, "+r"(dst_argb4444), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1460,7 +1660,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } -void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient @@ -1474,7 +1674,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" - : "+r"(src_argb), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : @@ -2119,6 +2319,105 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! \n" + "vmov.u8 q1, q0 \n" + "vmov.u8 q3, q2 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "vld1.8 q4, %3 \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! 
\n" + "vtbl.8 d2, {d0, d1}, d8 \n" + "vtbl.8 d3, {d0, d1}, d9 \n" + "vtbl.8 d6, {d4, d5}, d8 \n" + "vtbl.8 d7, {d4, d5}, d9 \n" + "vmov.u8 q0, q1 \n" + "vmov.u8 q2, q3 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vshrn.u16 d0, q0, #8 \n" + "vshrn.u16 d1, q1, #8 \n" + "vshrn.u16 d4, q2, #8 \n" + "vshrn.u16 d5, q3, #8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "vld1.8 d8, %3 \n" // shuffler + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vtbl.8 d0, {d0, d1}, d8 \n" + "vtbl.8 d1, {d2, d3}, d8 \n" + "vtbl.8 d4, {d4, d5}, d8 \n" + "vtbl.8 d5, {d6, d7}, d8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! 
\n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient @@ -2263,9 +2562,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B + "vmull.u8 q4, d0, d4 \n" // R "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R + "vmlal.u8 q4, d2, d6 \n" // B "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" @@ -2336,7 +2635,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2387,7 +2686,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2625,7 +2924,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2645,7 +2944,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
"bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2654,7 +2953,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2668,7 +2967,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "vqadd.u8 q1, q1, q3 \n" // add R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2677,7 +2976,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2691,7 +2990,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "vqsub.u8 q1, q1, q3 \n" // subtract R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -3171,32 +3470,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift. asm volatile( - "vdup.32 q0, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" - "vld2.16 {q1, q2}, [%0]! \n" // load 8 UV - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q4, d3 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q4 \n" - "vmovl.u16 q3, d4 \n" - "vmovl.u16 q4, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d4, q3 \n" - "vmovn.u32 d5, q4 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop - "vst1.16 {q1}, [%1]! \n" // store 8 U pixels - "vst1.16 {q2}, [%2]! 
\n" // store 8 V pixels + "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst1.16 {q0}, [%1]! \n" // store 8 U pixels + "vst1.16 {q1}, [%2]! \n" // store 8 V pixels "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } @@ -3207,21 +3496,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "vdup.16 q2, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q1}, [%1]! \n" // load 8 V "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop + "subs %3, %3, #8 \n" // 8 src pixels per loop "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2"); } diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc index 941c9b9805..da7e3c7cd4 100644 --- a/third_party/libyuv/source/row_neon64.cc +++ b/third_party/libyuv/source/row_neon64.cc @@ -18,93 +18,101 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. 
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +// v0.8h: Y +// v1.16b: 8U, 8V + // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v1.s}[0], [%1], #4 \n" \ - "ld1 {v1.s}[1], [%2], #4 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.s}[0], [%[src_u]], #4 \n" \ + "ld1 {v1.s}[1], [%[src_v]], #4 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v1.16b, v1.16b, v1.16b \n" \ + "prfm pldl1keep, [%[src_u], 128] \n" \ + "prfm pldl1keep, [%[src_v], 128] \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v1.d}[0], [%1], #8 \n" \ - "ld1 {v1.d}[1], [%2], #8 \n" \ - "uaddlp v1.8h, v1.16b \n" \ - "rshrn v1.8b, v1.8h, #1 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.d}[0], [%[src_u]], #8 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "ld1 {v1.d}[1], [%[src_v]], #8 \n" \ + "prfm pldl1keep, [%[src_u], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_v], 448] \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "movi v1.8b , #128 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "movi v1.16b, #128 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" + +static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, + 1, 1, 3, 3, 5, 5, 7, 7}; +static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, + 0, 0, 2, 2, 4, 4, 6, 6}; -// Read 8 Y and 4 UV from NV12 +// Read 8 Y and 4 UV from NV12 or NV21 #define READNV12 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -// Read 8 Y and 4 VU from NV21 -#define READNV21 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v3.8b, v2.8b, v2.8b \n" \ - "uzp2 v1.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ldr d1, [%[src_uv]], #8 \n" \ + "zip1 v0.16b, 
v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" \ + "prfm pldl1keep, [%[src_uv], 448] \n" // Read 8 YUY2 -#define READYUY2 \ - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ - "uzp2 v3.8b, v1.8b, v1.8b \n" \ - "uzp1 v1.8b, v1.8b, v1.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_yuy2], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" // Read 8 UYVY -#define READUYVY \ - "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ - "orr v0.8b, v3.8b, v3.8b \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -#define YUVTORGB_SETUP \ - "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" - -// clang-format off - -#define YUVTORGB(vR, vG, vB) \ - "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ - "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ - "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ - "ushll v0.4s, v0.4h, #0 \n" \ - "mul v3.4s, v3.4s, v31.4s \n" \ - "mul v0.4s, v0.4s, v31.4s \n" \ - "sqshrun v0.4h, v0.4s, #16 \n" \ - "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ - "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ - "uxtl v2.8h, v2.8b \n" \ - "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v27.8h, v1.8h \n" \ - "mul v5.8h, v29.8h, v1.8h \n" \ - "mul v6.8h, v30.8h, v2.8h \n" \ - "mul v7.8h, v28.8h, v2.8h \n" \ - "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " 
#vG ".8h, #6 \n" /* G */ \ - "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ - -// clang-format on +#define READUYVY \ + "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \ + "zip1 v0.16b, v4.16b, v4.16b \n" \ + "prfm pldl1keep, [%[src_uyvy], 448] \n" \ + "tbl v1.16b, {v3.16b}, v2.16b \n" + +// UB VR UG VG +// YG BB BG BR +#define YUVTORGB_SETUP \ + "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \ + "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n" + +// v16.8h: B +// v17.8h: G +// v18.8h: R + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "umull2 v3.4s, v0.8h, v24.8h \n" \ + "umull v6.8h, v1.8b, v30.8b \n" \ + "umull v0.4s, v0.4h, v24.4h \n" \ + "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ + "uqshrn v0.4h, v0.4s, #16 \n" \ + "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ + "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ + "add v17.8h, v0.8h, v26.8h \n" /* G */ \ + "add v16.8h, v0.8h, v4.8h \n" /* B */ \ + "add v18.8h, v0.8h, v5.8h \n" /* R */ \ + "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ + "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ + "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "uqshrn v17.8b, v17.8h, #6 \n" \ + "uqshrn v16.8b, v16.8h, #6 \n" \ + "uqshrn v18.8b, v18.8h, #6 \n" + +#define YUVTORGB_REGS \ + "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \ + "v26", "v27", "v28", "v29", "v30", "v31" void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -112,30 +120,22 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV444 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 
\n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" /* A */ + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422ToARGBRow_NEON(const uint8_t* src_y, @@ -144,31 +144,22 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - - "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, 
#255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I444AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -178,32 +169,23 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "1: \n" - READYUV444 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "ld1 {v23.8b}, [%3], #8 \n" - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] 
"r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -213,32 +195,23 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "ld1 {v23.8b}, [%3], #8 \n" - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422ToRGBARow_NEON(const uint8_t* src_y, @@ -247,30 +220,22 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - 
YUVTORGB_SETUP - "movi v20.8b, #255 \n" /* A */ - "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v23, v22, v21) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v15.8b, #255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v15"); } void I422ToRGB24Row_NEON(const uint8_t* src_y, @@ -279,39 +244,29 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - 
[kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTORGB565 \ - "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v21.8h, #5 \n" /* RG */ \ - "sri v0.8h, v20.8h, #11 \n" /* RGB */ - -// clang-format off + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v18.8h, v17.8h, #5 \n" /* RG */ \ + "sri v18.8h, v16.8h, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -320,38 +275,29 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" - ARGBTORGB565 - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
- "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTOARGB1555 \ - "shll v0.8h, v23.8b, #8 \n" /* A */ \ - "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v22.8h, #1 \n" /* AR */ \ - "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v20.8h, #11 \n" /* ARGB */ + "shll v0.8h, v19.8b, #8 \n" /* A */ \ + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v0.8h, v18.8h, #1 \n" /* AR */ \ + "sri v0.8h, v17.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v16.8h, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -360,40 +306,32 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" - ARGBTOARGB1555 - "prfm pldl1keep, [%1, 128] \n" 
- "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } -// clang-format on #define ARGBTOARGB4444 \ - /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ - "ushr v20.8b, v20.8b, #4 \n" /* B */ \ - "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ - "ushr v22.8b, v22.8b, #4 \n" /* R */ \ - "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ - "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ - "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \ + "ushr v16.8b, v16.8b, #4 \n" /* B */ \ + "bic v17.8b, v17.8b, v23.8b \n" /* G */ \ + "ushr v18.8b, v18.8b, #4 \n" /* R */ \ + "bic v19.8b, v19.8b, v23.8b \n" /* A */ \ + "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \ + "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8_t* src_y, @@ -402,58 +340,46 @@ void I422ToARGB4444Row_NEON(const uint8_t* 
src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" - ARGBTOARGB4444 - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v23.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" ARGBTOARGB4444 + "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 + // pixels + // ARGB4444. 
+ "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19", "v23"); } void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV400 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV400 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { @@ -479,28 +405,22 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READNV12 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, 
[%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void NV21ToARGBRow_NEON(const uint8_t* src_y, @@ -508,28 +428,22 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READNV21 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + 
YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void NV12ToRGB24Row_NEON(const uint8_t* src_y, @@ -537,27 +451,21 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV12 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void 
NV21ToRGB24Row_NEON(const uint8_t* src_y, @@ -565,27 +473,21 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV21 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void NV12ToRGB565Row_NEON(const uint8_t* src_y, @@ -594,75 +496,64 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP "1: \n" READNV12 - "prfm pldl1keep, [%0, 448] \n" YUVTORGB( - v22, v21, v20) ARGBTORGB565 - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 + // pixels + 
// RGB565. "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUY2 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), 
// %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READUYVY - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
@@ -673,8 +564,8 @@ void SplitUVRow_NEON(const uint8_t* src_uv, asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v1.16b}, [%2], #16 \n" // store V "b.gt 1b \n" @@ -696,9 +587,9 @@ void MergeUVRow_NEON(const uint8_t* src_u, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" : "+r"(src_u), // %0 @@ -719,8 +610,8 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, asm volatile( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store R "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%3], #16 \n" // store B @@ -746,12 +637,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,8 +663,8 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 
\n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -804,11 +694,11 @@ void MergeARGBRow_NEON(const uint8_t* src_r, "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -831,8 +721,8 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -859,10 +749,10 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "ld1 {v2.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -875,6 +765,240 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, ); } +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "dup v31.4s, %w5 \n" + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "ushl 
v2.4s, v2.4s, v31.4s \n" // 000B + "ushl v1.4s, v1.4s, v31.4s \n" // G + "ushl v0.4s, v0.4s, v31.4s \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // 000B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "v0", "v1", "v2", "v30"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "dup v30.8h, %w7 \n" + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + 
"prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umin v3.8h, v3.8h, v30.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "movi v3.16b, #0xff \n" // A (0xffff) + "dup v30.8h, %w6 \n" + "dup v31.8h, %w5 \n" + + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "ushl 
v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "uqxtn v3.8b, v3.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w5 \n" + "movi v3.8b, #0xff \n" // A (0xff) + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( @@ -1072,10 +1196,10 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v3.8b, v1.8b, v1.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r + "orr v4.8b, v0.8b, v0.8b \n" // move r "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1091,10 +1215,10 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v2.8b, v4.8b, v4.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g - "orr v1.8b, v5.8b, v5.8b \n" // move r + "orr v1.8b, v5.8b, v5.8b \n" // move r "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1109,9 +1233,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r "b.gt 1b \n" @@ -1143,9 +1267,8 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_rgb565), // %0 @@ -1233,9 +1356,8 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 
- "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb4444), // %0 @@ -1252,8 +1374,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of // RGB24 "b.gt 1b \n" @@ -1269,9 +1391,9 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g "orr v5.8b, v1.8b, v1.8b \n" // mov b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "b.gt 1b \n" @@ -1287,8 +1409,8 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_yuy2), // %0 @@ -1303,8 +1425,8 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
"b.gt 1b \n" : "+r"(src_uyvy), // %0 @@ -1322,8 +1444,8 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v3.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1343,8 +1465,8 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v2.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1366,10 +1488,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V "st1 {v1.8b}, [%2], #8 \n" // store 8 U. "st1 {v3.8b}, [%3], #8 \n" // store 8 V. @@ -1394,10 +1516,10 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V "st1 {v0.8b}, [%2], #8 \n" // store 8 U. "st1 {v2.8b}, [%3], #8 \n" // store 8 V. @@ -1422,8 +1544,8 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
- "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #4 \n" // 4 processed per loop + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "st1 {v1.16b}, [%1], #16 \n" // store 4. "b.gt 1b \n" @@ -1443,11 +1565,11 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels "orr v2.8b, v1.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -1467,8 +1589,8 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" "orr v3.8b, v2.8b, v2.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %w4, %w4, #16 \n" // 16 pixels @@ -1488,18 +1610,17 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. 
"b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v16", "v17", "v18", "v19"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, @@ -1509,20 +1630,20 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, asm volatile( "dup v1.4s, %w2 \n" // dither4 "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "uqadd v16.8b, v16.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqadd v17.8b, v17.8b, v1.8b \n" + "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 + "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 - : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, @@ -1530,39 +1651,131 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- ARGBTOARGB1555 + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v0", "v16", "v17", "v18", "v19"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "movi v4.16b, #0x0f \n" // bits to clear with + "movi v23.16b, #0x0f \n" // bits to clear with // vbic. "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); +} + +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "mov v1.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v1.16b, v0.16b \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAR64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, + 21, 19, 17, 23, 29, 27, 25, 31}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1573,9 +1786,9 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1614,9 +1827,9 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1629,22 +1842,22 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } -void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #29 \n" // B * 0.1140 coefficient "movi v5.8b, #150 \n" // G * 0.5870 coefficient "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v2.8b, v5.8b \n" // G "umlal v0.8h, v3.8b, v6.8b \n" // R "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" - : "+r"(src_argb), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : @@ -1666,9 +1879,9 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "movi v29.16b,#0x80 \n" // 128.5 "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v2.8b, v26.8b \n" // R "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned @@ -1729,14 +1942,14 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1775,13 +1988,13 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1815,13 +2028,13 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. @@ -1855,13 +2068,13 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 
+ "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -1895,13 +2108,13 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. @@ -1935,13 +2148,13 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1975,13 +2188,13 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. 
- "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -2016,9 +2229,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. @@ -2028,9 +2241,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB565TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. @@ -2074,9 +2287,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB555TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. 
@@ -2086,9 +2299,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB555TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. @@ -2132,9 +2345,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. @@ -2144,9 +2357,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%1, 448] \n" ARGB4444TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. @@ -2189,10 +2402,10 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
RGB565TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2217,10 +2430,10 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2244,10 +2457,10 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2269,9 +2482,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2293,9 +2506,9 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v16.8h, v0.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2317,9 +2530,9 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2341,9 +2554,9 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2365,9 +2578,9 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2388,9 +2601,9 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 
+ "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -2410,9 +2623,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "movi v4.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -2446,11 +2659,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" "umull v2.8h, v0.8b, v4.8b \n" + "prfm pldl1keep, [%1, 448] \n" "umull2 v3.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%2, 448] \n" "umlal v2.8h, v1.8b, v5.8b \n" "umlal2 v3.8h, v1.16b, v5.16b \n" "rshrn v0.8b, v2.8h, #8 \n" @@ -2463,10 +2676,10 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 50b \n" "b 99f \n" @@ -2474,8 +2687,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, // Blend 100 / 0 - Copy row unchanged. 
"100: \n" "ld1 {v0.16b}, [%1], #16 \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" @@ -2491,7 +2704,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2502,11 +2715,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "8: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2532,11 +2745,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, // ARGB0. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel // ARGB1. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #1 \n" // 1 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2553,7 +2766,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2570,14 +2783,14 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, // Attenuate 8 pixels. 
"1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "prfm pldl1keep, [%0, 448] \n" + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -2603,9 +2816,9 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, // 8 pixel loop. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale @@ -2645,9 +2858,9 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // 8 pixel loop. "1: \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" "uxtl v5.8h, v5.8b \n" "uxtl v6.8h, v6.8b \n" "uxtl v7.8h, v7.8b \n" @@ -2678,9 +2891,9 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "movi v26.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B @@ -2713,9 +2926,9 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "movi v30.8b, #50 \n" // BR coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v2.8b, v22.8b \n" // R "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G @@ -2750,9 +2963,9 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "prfm pldl1keep, [%0, 448] \n" "uxtl v17.8h, v17.8b \n" // g "uxtl v18.8h, v18.8b \n" // r "uxtl v19.8h, v19.8b \n" // a @@ -2800,7 +3013,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2809,11 +3022,11 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. 
"umull v0.8h, v0.8b, v4.8b \n" // multiply B + "prfm pldl1keep, [%0, 448] \n" "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "prfm pldl1keep, [%1, 448] \n" "umull v2.8h, v2.8b, v6.8b \n" // multiply R "umull v3.8h, v3.8b, v7.8b \n" // multiply A "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B @@ -2822,7 +3035,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2831,7 +3044,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2840,16 +3053,16 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2858,7 +3071,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2867,16 +3080,16 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqsub v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqsub v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2899,11 +3112,11 @@ void SobelRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v1.8b \n" // add + "prfm pldl1keep, [%0, 448] \n" "orr v1.8b, v0.8b, v0.8b \n" + "prfm pldl1keep, [%1, 448] \n" "orr v2.8b, v0.8b, v0.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -2925,10 +3138,10 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "uqadd v0.16b, v0.16b, v1.16b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -2954,10 +3167,10 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v0.8b, v2.8b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -2981,18 +3194,18 @@ void SobelXRow_NEON(const uint8_t* src_y0, "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" - "prfm pldl1keep, [%0, 448] \n" "usubl v0.8h, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v2.8b}, [%1],%5 \n" // center * 2 "ld1 {v3.8b}, [%1],%6 \n" - "prfm pldl1keep, [%1, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%1, 448] \n" "add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v3.8b}, [%2],%6 \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w4, %w4, #8 \n" // 8 pixels + "prfm pldl1keep, [%2, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" "add v0.8h, v0.8h, v1.8h \n" "abs v0.8h, v0.8h \n" @@ -3030,11 +3243,11 @@ void SobelYRow_NEON(const uint8_t* src_y0, "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v3.8b}, [%1],%5 \n" - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 pixels "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" "add v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%1, 448] \n" "abs v0.8h, v0.8h \n" "uqxtn v0.8b, v0.8h \n" "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely @@ -3057,9 +3270,9 @@ void HalfFloat1Row_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3081,9 +3294,9 @@ void HalfFloatRow_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 
{v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3107,9 +3320,9 @@ void ByteToFloatRow_NEON(const uint8_t* src, asm volatile( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v1.8h, v1.8b \n" // 8 shorts + "prfm pldl1keep, [%0, 448] \n" "uxtl v2.4s, v1.4h \n" // 8 ints "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats @@ -3136,9 +3349,9 @@ float ScaleMaxSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" // scale "fmax v5.4s, v5.4s, v1.4s \n" // max "fmax v6.4s, v6.4s, v2.4s \n" @@ -3166,9 +3379,9 @@ float ScaleSumSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares "fmla v6.4s, v2.4s, v2.4s \n" @@ -3376,10 +3589,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values "prfm pldl1keep, [%0, 448] \n" + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values "prfm pldl1keep, [%1, 448] \n" - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values "subs %w3, %w3, #16 \n" // 16 pixels per loop "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV 
pixels "b.gt 1b \n" @@ -3391,6 +3604,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, : "cc", "memory", "v0", "v1", "v2"); } +// AYUV is YVUA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, @@ -3400,12 +3614,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" @@ -3429,12 +3643,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. 
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" @@ -3454,8 +3668,8 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels "b.gt 1b \n" : "+r"(src_ayuv), // %0 @@ -3476,9 +3690,9 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop "tbl v0.16b, {v0.16b}, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v1.16b}, v2.16b \n" "stp q0, q1, [%1], 32 \n" // store 16 VU pixels "b.gt 1b \n" @@ -3531,34 +3745,24 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift. 
asm volatile( - "dup v0.4s, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" - "ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV + "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ushl v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll2 v4.4s, v1.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v1.4h, v3.4s \n" - "xtn2 v1.8h, v4.4s \n" - "ushll v3.4s, v2.4h, #0 \n" - "ushll2 v4.4s, v2.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v2.4h, v3.4s \n" - "xtn2 v2.8h, v4.4s \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop - "st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels - "st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels + "ushl v1.8h, v1.8h, v2.8h \n" + "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels + "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2"); } void MergeUVRow_16_NEON(const uint16_t* src_u, @@ -3568,23 +3772,22 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "dup v2.8h, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "prfm pldl1keep, [%1, 448] \n" "ushl v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v2.8h \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop + "prfm pldl1keep, [%1, 448] \n" "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "v0", "v1", 
"v2"); } @@ -3595,10 +3798,9 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, asm volatile( "dup v2.8h, %w2 \n" "1: \n" - "ldp q0, q1, [%0] \n" - "add %0, %0, #32 \n" - "prfm pldl1keep, [%0, 448] \n" + "ldp q0, q1, [%0], #32 \n" "mul v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "mul v1.8h, v1.8h, v2.8h \n" "stp q0, q1, [%1] \n" // store 16 pixels "add %1, %1, #32 \n" @@ -3619,11 +3821,10 @@ void DivideRow_16_NEON(const uint16_t* src_y, asm volatile( "dup v0.8h, %w2 \n" "1: \n" - "ldp q1, q2, [%0] \n" - "add %0, %0, #32 \n" - "prfm pldl1keep, [%0, 448] \n" + "ldp q1, q2, [%0], #32 \n" "ushll v3.4s, v1.4h, #0 \n" "ushll v4.4s, v2.4h, #0 \n" + "prfm pldl1keep, [%0, 448] \n" "ushll2 v1.4s, v1.8h, #0 \n" "ushll2 v2.4s, v2.8h, #0 \n" "mul v3.4s, v0.4s, v3.4s \n" diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc index 951518926f..5203b57c69 100644 --- a/third_party/libyuv/source/row_win.cc +++ b/third_party/libyuv/source/row_win.cc @@ -10,9 +10,9 @@ #include "libyuv/row.h" -// This module is for Visual C 32/64 bit and clangcl 32 bit +// This module is for Visual C 32/64 bit #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) + !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) #if defined(_M_X64) #include <emmintrin.h> @@ -29,9 +29,9 @@ extern "C" { // Read 8 UV from 444 #define READYUV444 \ - xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ u_buf += 8; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -39,9 +39,9 @@ extern "C" { // Read 8 UV from 444, With 8 Alpha. 
#define READYUVA444 \ - xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ u_buf += 8; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -51,10 +51,10 @@ extern "C" { // Read 4 UV from 422, upsample to 8 UV. #define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -62,10 +62,10 @@ extern "C" { // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -74,24 +74,21 @@ extern "C" { a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. 
-#define YUVTORGB(yuvconstants) \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm2 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ - xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ - xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ - xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm0 = _mm_adds_epi16(xmm0, xmm4); \ - xmm1 = _mm_adds_epi16(xmm1, xmm4); \ - xmm2 = _mm_adds_epi16(xmm2, xmm4); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ +#define YUVTORGB(yuvconstants) \ + xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ + xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ + xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ + xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ + xmm0 = _mm_adds_epi16(xmm4, xmm0); \ + xmm1 = _mm_subs_epi16(xmm4, xmm1); \ + xmm2 = _mm_adds_epi16(xmm4, xmm2); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ xmm2 = _mm_packus_epi16(xmm2, xmm2); // Store 8 ARGB values. 
@@ -112,7 +109,7 @@ void I422ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { @@ -132,7 +129,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 @@ -150,7 +147,7 @@ void I444ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { @@ -170,7 +167,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA444 @@ -247,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, // 7 bit fixed point 0.5. static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; +// 8 bit fixed point 0.5, for bias of UV. 
+static const ulvec8 kBiasUV128 = { + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { @@ -1427,7 +1424,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, } } -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1440,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1499,7 +1496,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1512,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUVJ128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v @@ -1573,7 +1570,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1586,7 +1583,7 @@ __declspec(naked) void 
ARGBToUVRow_AVX2(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1641,7 +1638,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1654,7 +1651,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kAddUVJ128 + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToVJ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v @@ -1709,7 +1706,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { @@ -1719,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1767,7 +1764,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, 
int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1780,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kBGRAToV movdqa xmm7, xmmword ptr kBGRAToU sub edi, edx // stride from u to v @@ -1839,7 +1836,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1852,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kABGRToV movdqa xmm7, xmmword ptr kABGRToU sub edi, edx // stride from u to v @@ -1911,7 +1908,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1924,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kRGBAToV movdqa xmm7, xmmword ptr kRGBAToU sub edi, edx // stride from u to v @@ -1986,14 +1983,14 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, // Read 16 UV from 444 #define READYUV444_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* U */ \ + 
__asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} @@ -2001,12 +1998,12 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, // Read 16 UV from 444. With 16 Alpha. #define READYUVA444_AVX2 \ __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm3, [esi] /* U */ \ __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ @@ -2017,123 +2014,122 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 \ - __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm { \ + __asm vmovq xmm3, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
#define READYUVA422_AVX2 \ - __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm { \ + __asm vmovq xmm3, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. #define READNV12_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. 
#define READNV21_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. #define READYUY2_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm0, [eax] /* UV */ \ - __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ + __asm vmovdqu ymm3, [eax] /* UV */ \ + __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \ __asm lea eax, [eax + 32]} // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. #define READUYVY_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm0, [eax] /* UV */ \ - __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ + __asm vmovdqu ymm3, [eax] /* UV */ \ + __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \ __asm lea eax, [eax + 32]} // Convert 16 pixels: 16 UV and 16 Y. 
#define YUVTORGB_AVX2(YuvConstants) \ - __asm { \ - __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ - __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ - __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ - __asm vpsubw ymm2, ymm3, ymm2 \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ - __asm vpsubw ymm1, ymm3, ymm1 \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ - __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm { \ + __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ - __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ - __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \ + __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \ + __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \ + __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \ + __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \ + __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \ + __asm vpaddw ymm4, ymm3, ymm4 \ + __asm vpaddsw ymm0, ymm0, ymm4 \ + __asm vpsubsw ymm1, ymm4, ymm1 \ + __asm vpaddsw ymm2, ymm2, ymm4 \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ - __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ - __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ - } + __asm vpackuswb ymm0, ymm0, ymm0 \ + __asm vpackuswb ymm1, ymm1, ymm1 \ + __asm vpackuswb ymm2, ymm2, ymm2} // Store 16 ARGB values. 
#define STOREARGB_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vmovdqu 0[edx], ymm1 \ __asm vmovdqu 32[edx], ymm0 \ __asm lea edx, [edx + 64]} // Store 16 RGBA values. #define STORERGBA_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ __asm vmovdqu [edx], ymm0 \ __asm vmovdqu [edx + 32], ymm1 \ __asm lea edx, [edx + 64]} @@ -2480,11 +2476,11 @@ __declspec(naked) void I422ToRGBARow_AVX2( // Read 8 UV from 444. #define READYUV444 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} @@ -2492,10 +2488,10 @@ __declspec(naked) void I422ToRGBARow_AVX2( // Read 4 UV from 444. With 8 Alpha. 
#define READYUVA444 \ __asm { \ - __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm movq xmm3, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ @@ -2504,180 +2500,178 @@ __declspec(naked) void I422ToRGBARow_AVX2( // Read 4 UV from 422, upsample to 8 UV. #define READYUV422 \ - __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm movd xmm3, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ - __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm movd xmm3, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. 
#define READNV12 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. #define READNV21 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ + __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. #define READYUY2 \ - __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm0, [eax] /* UV */ \ - __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. #define READUYVY \ - __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm0, [eax] /* UV */ \ - __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. 
#define YUVTORGB(YuvConstants) \ - __asm { \ - __asm movdqa xmm1, xmm0 \ - __asm movdqa xmm2, xmm0 \ - __asm movdqa xmm3, xmm0 \ - __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ - __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ - __asm psubw xmm0, xmm1 \ - __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ - __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ - __asm psubw xmm1, xmm2 \ - __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ - __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ - __asm psubw xmm2, xmm3 \ + __asm { \ + __asm psubb xmm3, xmmword ptr kBiasUV128 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm paddsw xmm0, xmm4 /* B += Y */ \ - __asm paddsw xmm1, xmm4 /* G += Y */ \ - __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ + __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ + __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ + __asm pmaddubsw xmm0, xmm3 \ + __asm pmaddubsw xmm1, xmm3 \ + __asm pmaddubsw xmm2, xmm3 \ + __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ + __asm paddw xmm4, xmm3 \ + __asm paddsw xmm0, xmm4 \ + __asm paddsw xmm2, xmm4 \ + __asm psubsw xmm4, xmm1 \ + __asm movdqa xmm1, xmm4 \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. 
#define STOREARGB \ - __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32]} // Store 8 BGRA values. #define STOREBGRA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGBA values. #define STORERGBA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGB24 values. 
#define STORERGB24 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24]} // Store 8 RGB565 values. 
#define STORERGB565 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand 
xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ __asm lea edx, [edx + 16]} // 8 pixels. @@ -4347,13 +4341,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4442,7 +4436,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width pcmpeqb xmm3, xmm3 // generate mask 0xff000000 @@ -4487,7 +4481,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4581,7 +4575,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4752,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values + packuswb xmm0, xmm0 // 8 B values movdqu xmm5, [eax] // G movdqu xmm1, [eax + 
16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values movdqu xmm5, [eax] // R movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + packuswb xmm5, xmm5 // 8 R values movdqu xmm6, [eax] // A movdqu xmm1, [eax + 16] psrld xmm6, 24 @@ -4817,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values movdqu xmm1, [eax] // R movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R + phaddsw xmm1, xmm7 // R movdqu xmm6, [eax] // A movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A packuswb xmm1, xmm1 // 8 R values packuswb xmm6, xmm6 // 8 A values punpcklbw xmm1, xmm6 // 8 RA values @@ -4878,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, convertloop: movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels + punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size + pmullw xmm0, xmm3 
// * interval_size movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 @@ -4907,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, int width, uint32_t value) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 @@ -4918,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 @@ -4937,29 +4931,29 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4977,14 +4971,14 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. 
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4992,11 +4986,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5007,11 +5001,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixels from src_argb lea eax, [eax + 4] movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -5026,23 +5020,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5056,20 +5050,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] @@ -5077,8 +5071,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, vpunpckhbw ymm1, ymm1, ymm1 // high 4 vpunpcklbw ymm2, ymm3, ymm5 // low 4 vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // 
src_argb0 * src_argb1 high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5094,19 +5088,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] @@ -5124,21 +5118,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5165,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 mov edi, [esp + 8 + 12] // src_y2 mov edx, [esp + 8 + 16] // dst_sobelx mov ecx, [esp + 8 + 20] // width @@ -5176,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5221,8 +5215,8 @@ 
__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 mov edx, [esp + 4 + 12] // dst_sobely mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5230,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5275,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5284,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, pslld xmm5, 24 // 0xff000000 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely @@ -5323,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, int width) { 
__asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5357,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax pcmpeqb xmm5, xmm5 // alpha 255 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 @@ -5535,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5577,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 
lea eax, [eax + 16] @@ -5623,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel lea eax, [eax + 4] @@ -5657,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, mov esi, [esp + 16] // stride mov edx, [esp + 20] // dst_argb mov ecx, [esp + 24] // pointer to uv_dudv - movq xmm2, qword ptr [ecx] // uv + movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width shl esi, 16 // 4, stride @@ -5666,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride movdqa xmm0, xmm2 // x0, y0, x1, y1 @@ -5678,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, addps xmm3, xmm4 addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2 packssdw xmm0, xmm1 // x, y as 8 shorts pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right + pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right + pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 punpckldq xmm1, xmm6 // combine pixel 0 and 1 @@ -5739,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5749,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax @@ -5776,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates + vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 @@ -5817,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 
movd xmm0, eax // high fraction 0..255 neg eax @@ -5846,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 + psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 @@ -5895,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, const uint8_t* shuffler, int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] mov ecx, [esp + 16] // width @@ -5922,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, const uint8_t* shuffler, int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
mov ecx, [esp + 16] // width @@ -5960,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U + movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV + punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 @@ -5997,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U + movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV + punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY + punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 @@ -6039,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. 
convertloop: - // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel - // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 @@ -6091,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, const float* poly, int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 12] /* poly */ vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 @@ -6131,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ movd xmm4, dword ptr [esp + 12] /* scale */ mov ecx, [esp + 16] /* width */ mulss xmm4, kExpBias @@ -6140,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, pxor xmm5, xmm5 sub edx, eax - // 8 pixel loop. + // 8 pixel loop. convertloop: movdqu xmm2, xmmword ptr [eax] // 8 shorts add eax, 16 @@ -6178,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, vpxor ymm5, ymm5, ymm5 sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vmovdqu ymm2, [eax] // 16 shorts add eax, 32 @@ -6188,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, vcvtdq2ps ymm2, ymm2 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. 
vmulps ymm2, ymm2, ymm4 - vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate vpsrld ymm2, ymm2, 13 vpackssdw ymm2, ymm2, ymm3 vmovdqu [eax + edx - 32], ymm2 @@ -6206,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, float scale, int width) { __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ vbroadcastss ymm4, [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts add eax, 32 - vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats vcvtdq2ps ymm3, ymm3 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 vmulps ymm3, ymm3, ymm4 - vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate vcvtps2ph xmm3, ymm3, 3 vmovdqu [eax + edx + 32], xmm2 vmovdqu [eax + edx + 32 + 16], xmm3 @@ -6240,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. @@ -6274,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. 
@@ -6309,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, __asm { push esi push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ mov ecx, [esp + 8 + 12] /* width */ movd xmm2, dword ptr [esp + 8 + 16] // luma table movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff @@ -6320,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop. convertloop: movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc index 4a5dc94aaa..03b0486f76 100644 --- a/third_party/libyuv/source/scale.cc +++ b/third_party/libyuv/source/scale.cc @@ -1446,7 +1446,8 @@ void ScalePlaneUp2_Bilinear(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { @@ -1459,7 +1460,7 @@ void ScalePlaneUp2_Bilinear(int src_width, // its original width, using linear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I210 to I410 and I212 to I412. -void ScalePlaneUp2_16_Linear(int src_width, +void ScalePlaneUp2_12_Linear(int src_width, int src_height, int dst_width, int dst_height, @@ -1476,21 +1477,21 @@ void ScalePlaneUp2_16_Linear(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSSE3; + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_NEON +#ifdef HAS_SCALEROWUP2LINEAR_12_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; } #endif @@ -1513,6 +1514,102 @@ void ScalePlaneUp2_16_Linear(int src_width, // its original size, using bilinear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I010 to I410 and I012 to I412. +void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. 
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. 
+ assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + void ScalePlaneUp2_16_Bilinear(int src_width, int src_height, int dst_width, @@ -1530,7 +1627,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3; } @@ -1945,6 +2042,17 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1981,13 +2089,13 @@ void ScalePlane_12(const uint16_t* src, } if ((dst_width + 1) / 2 
== src_width && filtering == kFilterLinear) { - ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } diff --git a/third_party/libyuv/source/scale_any.cc b/third_party/libyuv/source/scale_any.cc index d30f583366..965749c415 100644 --- a/third_party/libyuv/source/scale_any.cc +++ b/third_party/libyuv/source/scale_any.cc @@ -656,14 +656,22 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSSE3, - ScaleRowUp2_Linear_16_SSSE3, +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, + ScaleRowUp2_Linear_12_SSSE3, ScaleRowUp2_Linear_16_C, 15, uint16_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_AVX2 SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, ScaleRowUp2_Linear_AVX2, @@ -672,11 +680,19 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 +SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, + ScaleRowUp2_Linear_12_AVX2, + ScaleRowUp2_Linear_16_C, + 31, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, ScaleRowUp2_Linear_16_AVX2, ScaleRowUp2_Linear_16_C, - 31, + 15, uint16_t) #endif @@ -688,6 +704,14 @@ SUH2LANY(ScaleRowUp2_Linear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_12_NEON +SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, + ScaleRowUp2_Linear_12_NEON, + 
ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_16_NEON SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, ScaleRowUp2_Linear_16_NEON, @@ -744,14 +768,22 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, - ScaleRowUp2_Bilinear_16_SSSE3, +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, + ScaleRowUp2_Bilinear_12_SSSE3, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, ScaleRowUp2_Bilinear_SSSE3, @@ -768,6 +800,14 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, + ScaleRowUp2_Bilinear_12_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, ScaleRowUp2_Bilinear_16_AVX2, @@ -784,11 +824,19 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, + ScaleRowUp2_Bilinear_12_NEON, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_16_NEON SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, ScaleRowUp2_Bilinear_16_NEON, ScaleRowUp2_Bilinear_16_C, - 15, + 7, uint16_t) #endif @@ -860,7 +908,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, ScaleUVRowUp2_Linear_NEON, ScaleUVRowUp2_Linear_C, - 7, + 15, uint8_t) #endif @@ -868,7 +916,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, ScaleUVRowUp2_Linear_16_NEON, ScaleUVRowUp2_Linear_16_C, - 7, + 15, uint16_t) #endif @@ -966,7 +1014,7 @@ 
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, ScaleUVRowUp2_Bilinear_16_NEON, ScaleUVRowUp2_Bilinear_16_C, - 3, + 7, uint16_t) #endif diff --git a/third_party/libyuv/source/scale_gcc.cc b/third_party/libyuv/source/scale_gcc.cc index f03903f0be..279c5e4020 100644 --- a/third_party/libyuv/source/scale_gcc.cc +++ b/third_party/libyuv/source/scale_gcc.cc @@ -17,8 +17,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, @@ -950,8 +949,8 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 -void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( @@ -1000,8 +999,8 @@ void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 -void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -1045,11 +1044,11 @@ void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, "paddw %%xmm3,%%xmm5 \n" // near+far "paddw %%xmm1,%%xmm1 \n" // 2*near "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (1, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (1, hi) + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - // xmm4 xmm1 xmm0 xmm2 - // xmm5 xmm2 xmm1 xmm3 + // xmm0 xmm2 + // xmm1 xmm3 "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm1,%%xmm5 \n" @@ -1099,6 +1098,166 @@ void 
ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, } #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packssdw %%xmm1,%%xmm0 \n" + "pshufd $0b11011000,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + 
"punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + "paddd %%xmm0,%%xmm2 \n" // near+far (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 2(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) + "paddd %%xmm2,%%xmm4 \n" // near+far (lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld 
$4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packssdw %%xmm0,%%xmm4 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packssdw %%xmm2,%%xmm5 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_SCALEROWUP2LINEAR_SSSE3 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -1352,8 +1511,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( @@ -1402,8 +1561,8 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, } #endif -#ifdef 
HAS_SCALEROWUP2BILINEAR_16_AVX2 -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -1466,6 +1625,139 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, } #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t 
src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" 
// 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + // Reads 16xN bytes and produces 16 shorts at a time. 
void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, @@ -2522,7 +2814,6 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( - "vpxor %%xmm5,%%xmm5,%%xmm5 \n" "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2532,11 +2823,8 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 - - "vpunpcklwd %%ymm5,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) @@ -2564,7 +2852,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif @@ -2575,7 +2863,6 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( - "vpxor %%xmm7,%%xmm7,%%xmm7 \n" "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -2585,10 +2872,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 - "vpunpcklwd %%ymm7,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm7,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 
01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) @@ -2600,10 +2885,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1122000033440000 - "vpunpcklwd %%ymm7,%%ymm2,%%ymm2 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm7,%%ymm3,%%ymm3 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) @@ -2652,8 +2935,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc index 41dba3e8ea..6a0d6e1b49 100644 --- a/third_party/libyuv/source/scale_neon.cc +++ b/third_party/libyuv/source/scale_neon.cc @@ -603,7 +603,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; @@ -633,7 +633,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, ); } -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_NEON(const 
uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -647,7 +647,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmov.u16 q15, #3 \n" "1: \n" - "add %5, %0, #2 \n" "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) @@ -655,7 +654,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - "add %5, %1, #2 \n" "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) @@ -692,6 +690,102 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u16 d31, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) + + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 4567 (32b) + "vmovl.u16 q4, d2 \n" // 1234 (32b) + "vmovl.u16 q5, d3 \n" // 5678 (32b) + + "vmlal.u16 q2, d2, d31 \n" + "vmlal.u16 q3, d3, d31 \n" + "vmlal.u16 q4, d0, d31 \n" + "vmlal.u16 q5, d1, d31 \n" + + "vrshrn.u32 d0, q4, #2 \n" + "vrshrn.u32 d1, q5, #2 \n" + "vrshrn.u32 d2, q2, #2 \n" + "vrshrn.u32 d3, q3, #2 \n" + + "vst2.16 {q0, q1}, [%1]! 
\n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 d31, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.16 {d0}, [%0]! \n" // 0123 (16b) + "vld1.16 {d1}, [%5]! \n" // 1234 (16b) + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 1234 (32b) + "vmlal.u16 q2, d1, d31 \n" + "vmlal.u16 q3, d0, d31 \n" + + "vld1.16 {d0}, [%1]! \n" // 0123 (16b) + "vld1.16 {d1}, [%6]! \n" // 1234 (16b) + "vmovl.u16 q4, d0 \n" // 0123 (32b) + "vmovl.u16 q5, d1 \n" // 1234 (32b) + "vmlal.u16 q4, d1, d31 \n" + "vmlal.u16 q5, d0, d31 \n" + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" + "vmla.u32 q5, q3, q14 \n" + "vmla.u32 q2, q0, q14 \n" + "vmla.u32 q3, q1, q14 \n" + + "vrshrn.u32 d1, q4, #4 \n" + "vrshrn.u32 d0, q5, #4 \n" + "vrshrn.u32 d3, q2, #4 \n" + "vrshrn.u32 d2, q3, #4 \n" + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! 
\n" // store + "subs %4, %4, #8 \n" // 4 sample -> 8 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d31" // Clobber List + ); +} + void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc index 22fedcb5a4..8656fec7fa 100644 --- a/third_party/libyuv/source/scale_neon64.cc +++ b/third_party/libyuv/source/scale_neon64.cc @@ -630,7 +630,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; @@ -661,7 +661,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, ); } -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -721,6 +721,106 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b) + "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) + "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + "umlal2 
v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) + + "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) + "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) + + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 0123 (16b) + "ldr d1, [%2], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" // 0123 (16b) + "ldr d1, [%3], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far 
+ "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far + + "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 + "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 + + "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -888,8 +988,8 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - "mov v0.4s, v4.4s \n" - "mov v1.4s, v5.4s \n" + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) diff --git a/third_party/libyuv/source/scale_uv.cc b/third_party/libyuv/source/scale_uv.cc index 7b977912f9..d9a314453e 100644 --- a/third_party/libyuv/source/scale_uv.cc +++ b/third_party/libyuv/source/scale_uv.cc @@ -746,7 +746,8 @@ void ScaleUVBilinearUp2(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { @@ -851,7 +852,8 @@ void ScaleUVBilinearUp2_16(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. 
+ // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc index c5fc86f3e9..ea1f95c6c3 100644 --- a/third_party/libyuv/source/scale_win.cc +++ b/third_party/libyuv/source/scale_win.cc @@ -16,8 +16,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, diff --git a/third_party/libyuv/unit_test/color_test.cc b/third_party/libyuv/unit_test/color_test.cc index a81ab19a86..e2d037ff79 100644 --- a/third_party/libyuv/unit_test/color_test.cc +++ b/third_party/libyuv/unit_test/color_test.cc @@ -22,8 +22,7 @@ namespace libyuv { // TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB. 
// Port to Visual C and other CPUs -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define ERROR_FULL 5 #define ERROR_J420 4 #else @@ -32,7 +31,11 @@ namespace libyuv { #endif #define ERROR_R 1 #define ERROR_G 1 -#define ERROR_B 3 +#ifdef LIBYUV_UNLIMITED_DATA +#define ERROR_B 1 +#else +#define ERROR_B 18 +#endif #define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \ TEST_F(LibYUVColorTest, TESTNAME) { \ @@ -498,7 +501,11 @@ TEST_F(LibYUVColorTest, TestYUV) { YUVToRGB(240, 0, 0, &r1, &g1, &b1); EXPECT_EQ(57, r1); EXPECT_EQ(255, g1); +#ifdef LIBYUV_UNLIMITED_DATA + EXPECT_EQ(3, b1); +#else EXPECT_EQ(5, b1); +#endif for (int i = 0; i < 256; ++i) { YUVToRGBReference(i, 128, 128, &r0, &g0, &b0); @@ -655,9 +662,9 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) { int y = RANDOM256(y2); YUVJToRGBReference(y, u, v, &r0, &g0, &b0); YUVJToRGB(y, u, v, &r1, &g1, &b1); - EXPECT_NEAR(r0, r1, 1); - EXPECT_NEAR(g0, g1, 1); - EXPECT_NEAR(b0, b1, 1); + EXPECT_NEAR(r0, r1, ERROR_R); + EXPECT_NEAR(g0, g1, ERROR_G); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; @@ -687,8 +694,7 @@ TEST_F(LibYUVColorTest, TestFullYUVH) { YUVHToRGB(y, u, v, &r1, &g1, &b1); EXPECT_NEAR(r0, r1, ERROR_R); EXPECT_NEAR(g0, g1, ERROR_G); - // TODO(crbug.com/libyuv/862): Reduce the errors in the B channel. 
- EXPECT_NEAR(b0, b1, 15); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; @@ -716,9 +722,9 @@ TEST_F(LibYUVColorTest, TestFullYUVF) { int y = RANDOM256(y2); YUVFToRGBReference(y, u, v, &r0, &g0, &b0); YUVFToRGB(y, u, v, &r1, &g1, &b1); - EXPECT_NEAR(r0, r1, 5); - EXPECT_NEAR(g0, g1, 5); - EXPECT_NEAR(b0, b1, 5); + EXPECT_NEAR(r0, r1, ERROR_R); + EXPECT_NEAR(g0, g1, ERROR_G); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; @@ -748,8 +754,7 @@ TEST_F(LibYUVColorTest, TestFullYUVU) { YUVUToRGB(y, u, v, &r1, &g1, &b1); EXPECT_NEAR(r0, r1, ERROR_R); EXPECT_NEAR(g0, g1, ERROR_G); - // TODO(crbug.com/libyuv/863): Reduce the errors in the B channel. - EXPECT_NEAR(b0, b1, 18); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; diff --git a/third_party/libyuv/unit_test/compare_test.cc b/third_party/libyuv/unit_test/compare_test.cc index bd99cdd3ac..c29562cb86 100644 --- a/third_party/libyuv/unit_test/compare_test.cc +++ b/third_party/libyuv/unit_test/compare_test.cc @@ -344,7 +344,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { uint32_t h1 = 0; - const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31; + const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 63) & ~63; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 255u, kMaxWidth); diff --git a/third_party/libyuv/unit_test/convert_test.cc b/third_party/libyuv/unit_test/convert_test.cc index 8638a84c13..3855838381 100644 --- a/third_party/libyuv/unit_test/convert_test.cc +++ b/third_party/libyuv/unit_test/convert_test.cc @@ -55,14 +55,14 @@ namespace libyuv { static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || 
SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ + "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ + "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ + "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -137,7 +137,7 @@ namespace libyuv { DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 1, SRC_DEPTH) \ @@ -183,8 +183,8 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) #define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##_##PN##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##To##PN##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSizeUV = \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -270,7 +270,7 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \ _Any, +, 0, PN, OFF_U, OFF_V) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \ @@ -318,7 +318,7 @@ int I400ToNV21(const uint8_t* src_y, "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -386,7 +386,7 @@ int I400ToNV21(const uint8_t* src_y, DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ @@ -424,7 +424,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -493,7 +493,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1, \ + DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \ SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ @@ -537,7 +537,7 @@ TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12) "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -606,7 +606,7 @@ TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12) DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ @@ -654,7 +654,7 @@ TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, 
FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -702,7 +702,7 @@ TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0) \ + YALIGN, benchmark_width_ + 1, _Any, +, 0) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 1) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ @@ -769,12 +769,14 @@ TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1) TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) +TESTPLANARTOB(I420, 2, 2, AB30, 4, 4, 1) +TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1) #endif #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF, ATTEN) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -821,7 +823,7 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) #define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 1, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ @@ -928,6 +930,8 @@ TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1) @@ -938,6 +942,8 @@ TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1) @@ -948,6 +954,8 @@ TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 
1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1) @@ -956,7 +964,7 @@ TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ BPP_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideB = kWidth * BPP_B; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -1009,7 +1017,7 @@ TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1) #define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0) \ + benchmark_width_ + 1, _Any, +, 0) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, _Unaligned, +, 1) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ @@ -1064,7 +1072,7 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ @@ -1111,7 +1119,7 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) #define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ + benchmark_width_ + 1, _Any, +, 0) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 1) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ @@ -1134,6 +1142,7 @@ TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2) TESTATOPLANAR(I400, 1, 1, I420, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2) +TESTATOPLANAR(RAW, 3, 1, J420, 2, 2) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2) TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2) TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2) @@ -1145,7 +1154,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) #define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -1191,7 +1200,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) #define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ + benchmark_width_ + 1, _Any, +, 0) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 1) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ @@ -1208,152 +1217,166 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) -#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = \ - (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 1, kStrideB* kHeightB); \ - memset(dst_argb_opt, 101, kStrideB* kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \ - NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \ - kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ +#define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + 
align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c, 1, kStrideB* kHeightB); \ + memset(dst_argb_opt, 101, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \ + kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, \ + (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \ + TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (fastrand() & 63) + 1; \ + const int kHeight = (fastrand() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + align_buffer_page_end(dst_argb_c, \ + kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + 
src_argb[i] = 0xfe; \ + } \ + memset(dst_argb_c, 123, kStrideB* kHeightB); \ + memset(dst_argb_opt, 123, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \ + kStrideB, kWidth, kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \ + kStrideB, kWidth, kHeight); \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } \ } -#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ - STRIDE_B, HEIGHT_B) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ - for (int times = 0; times < benchmark_iterations_; ++times) { \ - const int kWidth = (fastrand() & 63) + 1; \ - const int kHeight = (fastrand() & 31) + 1; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = \ - (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 123, kStrideB* kHeightB); \ - memset(dst_argb_opt, 123, kStrideB* kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \ - kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \ - kHeight); \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - 
EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } \ - } - -#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Invert, -, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Opt, +, 0) \ - TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B) - -TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1) -TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1) +#define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Invert, -, 0) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0) \ + TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) + +TESTATOB(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif -TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1) 
+TESTATOB(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1) #endif -TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1) -TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1) -TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1) -TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1) -TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1) -TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1) -TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1) -TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1) -TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1) -TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST 
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) #endif -TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1) -TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1) // 4 -TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1) -TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1) -TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1) -TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1) -TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1) -TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1) -TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1) -TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1) -TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1) -TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1) -TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1) -TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1) -TESTATOB(RGB24, 3, 3, 1, RGB24Mirror, 3, 3, 1) -TESTATOB(RAW, 3, 3, 1, J400, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) // 4 +TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1) +TESTATOB(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1) 
+TESTATOB(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) #endif -TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1) -TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1) -TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) +TESTATOB(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ @@ -1423,7 +1446,7 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) #define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ + HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ @@ -1437,35 +1460,39 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) #endif -#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \ +#define TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, \ + OFF) \ TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \ - align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_c, 1, kStrideA* kHeightA); \ memset(dst_argb_opt, 101, kStrideA* kHeightA); \ MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \ - NEG kHeight); \ + FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c, \ + kStrideA, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \ - NEG kHeight); \ + FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_opt, \ + kStrideA, kWidth, NEG kHeight); \ } \ MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \ + FMT_ATOB((TYPE_A*)dst_argb_c, kStrideA, (TYPE_A*)dst_argb_c, kStrideA, \ + kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ - FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \ - NEG kHeight); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \ + 
kWidth, NEG kHeight); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ @@ -1474,18 +1501,20 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \ - 0) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \ - +, 1) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0) +#define TESTSYM(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ + 1, \ + _Any, +, 0) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ + _Unaligned, +, 1) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ + _Opt, +, 0) -TESTSYM(ARGBToARGB, 4, 4, 1) -TESTSYM(ARGBToBGRA, 4, 4, 1) -TESTSYM(ARGBToABGR, 4, 4, 1) -TESTSYM(BGRAToARGB, 4, 4, 1) -TESTSYM(ABGRToARGB, 4, 4, 1) +TESTSYM(ARGBToARGB, uint8_t, 4, 4, 1) +TESTSYM(ARGBToBGRA, uint8_t, 4, 4, 1) +TESTSYM(ARGBToABGR, uint8_t, 4, 4, 1) +TESTSYM(BGRAToARGB, uint8_t, 4, 4, 1) +TESTSYM(ABGRToARGB, uint8_t, 4, 4, 1) +TESTSYM(AB64ToAR64, uint16_t, 4, 4, 1) TEST_F(LibYUVConvertTest, Test565) { SIMD_ALIGNED(uint8_t orig_pixels[256][4]); @@ -2349,7 +2378,11 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) { // Test result matches known hash value. 
uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381); +#ifdef LIBYUV_UNLIMITED_DATA + EXPECT_EQ(dst_argb_hash, 3900633302u); +#else EXPECT_EQ(dst_argb_hash, 2355976473u); +#endif free_aligned_buffer_page_end(dst_argb); } @@ -2658,7 +2691,7 @@ TEST_F(LibYUVConvertTest, TestDither) { #define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -2711,7 +2744,7 @@ TEST_F(LibYUVConvertTest, TestDither) { #define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ @@ -2784,11 +2817,12 @@ TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) // Transitive tests. A to B to C is same as A to C. +// Benchmarks A To B to C for comparison to 1 step, benchmarked elsewhere. #define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ W1280, N, NEG, OFF, FMT_C, BPP_C) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -2805,23 +2839,23 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) src_v[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, dst_argb_b + OFF, \ - kStrideB, kWidth, NEG kHeight); \ - } \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_b + OFF, kStrideB, \ + kWidth, NEG kHeight); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideC, \ - kWidth, NEG kHeight); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_c + OFF, \ + kStrideC, kWidth, NEG kHeight); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \ + kStrideC, kWidth, kHeight); \ + } \ for (int i = 0; i < kStrideC * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ } \ @@ -2836,7 +2870,7 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) #define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, 
BPP_B, \ FMT_C, BPP_C) \ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ @@ -2844,26 +2878,28 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C) -TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4) TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4) -TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4) -TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4) -TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4) -TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4) +TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3) +TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3) TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4) -TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4) -TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4) +TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4) TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3) +TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4) TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3) -TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3) -TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3) -TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) +TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ABGR, 4) TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3) +TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RGB24, 3) TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, 
RAW, 1, 3, RGB24, 3) +TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) +TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4) +TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2) @@ -2899,8 +2935,8 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) #define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ const int kSizeUV = \ @@ -2919,25 +2955,25 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) src_v[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ - dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ - } \ + FMT_PLANAR##To##FMT_B( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ + dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - 
FMT_PLANAR##To##FMT_C( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ - dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_C( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ + dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \ + kStrideC, kWidth, kHeight); \ + } \ for (int i = 0; i < kStrideC * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ } \ @@ -2953,7 +2989,7 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) #define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ FMT_C, BPP_C) \ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \ + benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C, 0) \ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ @@ -3000,8 +3036,8 @@ TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) #define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \ OFF, FMT_C, BPP_C) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##To##FMT_C##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ @@ -3009,21 +3045,21 @@ TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \ - kWidth, NEG kHeight); \ - } \ + FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \ + kWidth, NEG kHeight); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \ - kWidth, NEG kHeight); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \ + kWidth, NEG kHeight); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \ + kStrideC, kWidth, kHeight); \ + } \ for (int i = 0; i < kStrideC * kHeight; i += 4) { \ EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \ EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \ @@ -3038,7 +3074,7 @@ TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) #define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, 
SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ _Unaligned, +, 1, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ @@ -3161,91 +3197,457 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { } #endif // HAS_ABGRTOAR30ROW_AVX2 +// Provide matrix wrappers for 12 bit YUV +#define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) + +#define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define H410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define H410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define U410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) +#define U410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) +#define I410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define H410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define H410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define U410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, 
g, h, &kYuv2020Constants, i, j) +#define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) + // TODO(fbarchard): Fix clamping issue affected by U channel. -#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ - align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \ - reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ - reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ - reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ - dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ - reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ - 
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ - dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } - -#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0) - -TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1) +#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ + BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV 
= kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \ + reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ + dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ + dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ + BPP_B, ALIGN, YALIGN) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + 
ALIGN, YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0) + +// These conversions are only optimized for x86 +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1) + #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1) 
-TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1) -#endif +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) +#endif // LITTLE_ENDIAN_ONLY_TEST +#endif // ENABLE_SLOW_TESTS + +#define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, N, NEG, OFF, ATTEN, S_DEPTH) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, 
kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16_t*>(src_y + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + reinterpret_cast<uint16_t*>(src_a + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16_t*>(src_u + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + reinterpret_cast<uint16_t*>(src_v + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \ + dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \ + ATTEN); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \ + dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(src_a); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ + 
TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Premult, +, 0, 1, S_DEPTH) + +#define I010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + 
I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define I210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + 
I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define I410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + 
I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) + +// These conversions are only optimized for x86 +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(U010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(U210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(I410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U410Alpha, 1, 1, ARGB, 4, 4, 1, 10) 
+TESTQPLANAR16TOB(U410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +#endif // ENABLE_SLOW_TESTS + +#define TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = \ + (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16_t*>(src_uv + SOFF)[i] = \ + (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_uv + SOFF), \ + kStrideUV, dst_argb_c + DOFF, kStrideB, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_uv + SOFF), \ + kStrideUV, dst_argb_opt + DOFF, kStrideB, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + DOFF], 
dst_argb_opt[i + DOFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTBIPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 1, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) + +#define P010ToARGB(a, b, c, d, e, f, g, h) \ + P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P210ToARGB(a, b, c, d, e, f, g, h) \ + P210ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P010ToAR30(a, b, c, d, e, f, g, h) \ + P010ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P210ToAR30(a, b, c, d, e, f, g, h) \ + P210ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#define P012ToARGB(a, b, c, d, e, f, g, h) \ + P012ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P212ToARGB(a, b, c, d, e, f, g, h) \ + P212ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P012ToAR30(a, b, c, d, e, f, g, h) \ + P012ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P212ToAR30(a, b, c, d, e, f, g, h) \ + P212ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#define P016ToARGB(a, b, c, d, e, f, g, h) \ + P016ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P216ToARGB(a, b, c, d, e, f, g, h) \ + P216ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P016ToAR30(a, b, c, d, e, f, g, h) 
\ + P016ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P216ToAR30(a, b, c, d, e, f, g, h) \ + P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) +TESTBIPLANAR16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTBIPLANAR16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) +TESTBIPLANAR16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) +#endif // LITTLE_ENDIAN_ONLY_TEST +#endif // defined(ENABLE_SLOW_TESTS) static int Clamp(int y) { if (y < 0) { diff --git a/third_party/libyuv/unit_test/cpu_test.cc b/third_party/libyuv/unit_test/cpu_test.cc index 7264de0801..4035cf2bbc 100644 --- a/third_party/libyuv/unit_test/cpu_test.cc +++ b/third_party/libyuv/unit_test/cpu_test.cc @@ -72,26 +72,98 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { #endif } -TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) { -#if defined(__aarch64__) - printf("Arm64 build\n"); +TEST_F(LibYUVBaseTest, TestCompilerMacros) { + // Tests all macros used in public headers. 
+#ifdef __ATOMIC_RELAXED + printf("__ATOMIC_RELAXED %d\n", __ATOMIC_RELAXED); #endif -#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON) - printf("Neon build enabled\n"); +#ifdef __cplusplus + printf("__cplusplus %ld\n", __cplusplus); #endif -#if defined(__x86_64__) || defined(_M_X64) - printf("x64 build\n"); +#ifdef __clang_major__ + printf("__clang_major__ %d\n", __clang_major__); +#endif +#ifdef __clang_minor__ + printf("__clang_minor__ %d\n", __clang_minor__); +#endif +#ifdef __GNUC__ + printf("__GNUC__ %d\n", __GNUC__); +#endif +#ifdef __GNUC_MINOR__ + printf("__GNUC_MINOR__ %d\n", __GNUC_MINOR__); +#endif +#ifdef __i386__ + printf("__i386__ %d\n", __i386__); +#endif +#ifdef __mips + printf("__mips %d\n", __mips); +#endif +#ifdef __mips_isa_rev + printf("__mips_isa_rev %d\n", __mips_isa_rev); +#endif +#ifdef __x86_64__ + printf("__x86_64__ %d\n", __x86_64__); #endif #ifdef _MSC_VER printf("_MSC_VER %d\n", _MSC_VER); #endif -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(VISUALC_HAS_AVX2)) - printf("Has AVX2 1\n"); -#else - printf("Has AVX2 0\n"); -// If compiler does not support AVX2, the following function not expected: +#ifdef __aarch64__ + printf("__aarch64__ %d\n", __aarch64__); +#endif +#ifdef __APPLE__ + printf("__APPLE__ %d\n", __APPLE__); +#endif +#ifdef __arm__ + printf("__arm__ %d\n", __arm__); +#endif +#ifdef __clang__ + printf("__clang__ %d\n", __clang__); +#endif +#ifdef __CLR_VER + printf("__CLR_VER %d\n", __CLR_VER); +#endif +#ifdef __CYGWIN__ + printf("__CYGWIN__ %d\n", __CYGWIN__); +#endif +#ifdef __llvm__ + printf("__llvm__ %d\n", __llvm__); +#endif +#ifdef __mips_msa + printf("__mips_msa %d\n", __mips_msa); +#endif +#ifdef __native_client__ + printf("__native_client__ %d\n", __native_client__); +#endif +#ifdef __pic__ + printf("__pic__ %d\n", __pic__); +#endif +#ifdef __pnacl__ + printf("__pnacl__ %d\n", __pnacl__); +#endif +#ifdef _M_IX86 + printf("_M_IX86 
%d\n", _M_IX86); +#endif +#ifdef _M_X64 + printf("_M_X64 %d\n", _M_X64); +#endif +#ifdef _MIPS_ARCH_LOONGSON3A + printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A); +#endif +#ifdef _WIN32 + printf("_WIN32 %d\n", _WIN32); +#endif +#ifdef GG_LONGLONG + printf("GG_LONGLONG %d\n", GG_LONGLONG); +#endif +#ifdef INT_TYPES_DEFINED + printf("INT_TYPES_DEFINED\n"); +#endif +#ifdef __has_feature + printf("__has_feature\n"); +#if __has_feature(memory_sanitizer) + printf("__has_feature(memory_sanitizer) %d\n", + __has_feature(memory_sanitizer)); +#endif #endif } diff --git a/third_party/libyuv/unit_test/planar_test.cc b/third_party/libyuv/unit_test/planar_test.cc index fd1755cdca..5c60842136 100644 --- a/third_party/libyuv/unit_test/planar_test.cc +++ b/third_party/libyuv/unit_test/planar_test.cc @@ -155,7 +155,7 @@ static int TestAttenuateI(int width, } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) { - int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestAttenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); @@ -228,7 +228,7 @@ static int TestUnattenuateI(int width, } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { - int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); @@ -1076,7 +1076,7 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { #define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \ N, NEG, OFF) \ TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideA = \ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ @@ -1108,7 +1108,7 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { } #define TESTINTERPOLATE(TERP) \ - TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \ + TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ + 1, TERP, _Any, +, 0) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0) @@ -1174,7 +1174,7 @@ static int TestBlend(int width, TEST_F(LibYUVPlanarTest, ARGBBlend_Any) { int max_diff = - TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, + TestBlend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); } @@ -1280,7 +1280,7 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) { disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } TEST_F(LibYUVPlanarTest, BlendPlane_Any) { - TestBlendPlane(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, + TestBlendPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } TEST_F(LibYUVPlanarTest, BlendPlane_Invert) { @@ -1375,7 +1375,7 @@ TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) { // TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable. 
TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) { - TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, + TestI420Blend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); } TEST_F(LibYUVPlanarTest, I420Blend_Invert) { @@ -1524,7 +1524,7 @@ static int TestMultiply(int width, } TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) { - int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestMultiply(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); @@ -1599,7 +1599,7 @@ static int TestAdd(int width, TEST_F(LibYUVPlanarTest, ARGBAdd_Any) { int max_diff = - TestAdd(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestAdd(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } @@ -1672,7 +1672,7 @@ static int TestSubtract(int width, } TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) { - int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestSubtract(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); @@ -1745,7 +1745,7 @@ static int TestSobel(int width, TEST_F(LibYUVPlanarTest, ARGBSobel_Any) { int max_diff = - TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestSobel(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } @@ -1818,7 +1818,7 @@ static int TestSobelToPlane(int width, } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) { - int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestSobelToPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); 
EXPECT_EQ(0, max_diff); @@ -1890,7 +1890,7 @@ static int TestSobelXY(int width, } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) { - int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestSobelXY(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); @@ -1966,29 +1966,35 @@ static int TestBlur(int width, return max_diff; } +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +#define DISABLED_ARM(name) name +#else +#define DISABLED_ARM(name) DISABLED_##name +#endif + static const int kBlurSize = 55; -TEST_F(LibYUVPlanarTest, ARGBBlur_Any) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Any)) { int max_diff = - TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Unaligned)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Invert)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Opt)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); @@ -1996,35 +2002,35 @@ TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) { } static const int kBlurSmallSize = 5; -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Any)) { int 
max_diff = - TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Unaligned)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Invert)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Opt)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]); @@ -2398,8 +2404,7 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(dst_pixels_opt, kPixels); align_buffer_page_end(dst_pixels_c, kPixels); @@ -2427,8 +2432,7 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * 
benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(orig_pixels, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); @@ -2505,7 +2509,7 @@ static int TestARGBRect(int width, } TEST_F(LibYUVPlanarTest, ARGBRect_Any) { - int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4); EXPECT_EQ(0, max_diff); @@ -2533,7 +2537,7 @@ TEST_F(LibYUVPlanarTest, ARGBRect_Opt) { } TEST_F(LibYUVPlanarTest, SetPlane_Any) { - int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_EQ(0, max_diff); @@ -2561,35 +2565,25 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; - align_buffer_page_end(src_pixels, kPixels * 2); - align_buffer_page_end(tmp_pixels_u, kPixels); - align_buffer_page_end(tmp_pixels_v, kPixels); + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_u, kPixels); + align_buffer_page_end(src_pixels_v, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); - MemRandomize(src_pixels, kPixels * 2); - MemRandomize(tmp_pixels_u, kPixels); - MemRandomize(tmp_pixels_v, kPixels); + MemRandomize(src_pixels_u, kPixels); + MemRandomize(src_pixels_v, kPixels); MemRandomize(dst_pixels_opt, kPixels * 2); MemRandomize(dst_pixels_c, kPixels * 2); MaskCpuFlags(disable_cpu_flags_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, 
benchmark_width_, - benchmark_height_); - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_); - for (int i = 0; i < benchmark_iterations_; ++i) { - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_); } @@ -2598,60 +2592,127 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } - free_aligned_buffer_page_end(src_pixels); - free_aligned_buffer_page_end(tmp_pixels_u); - free_aligned_buffer_page_end(tmp_pixels_v); + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} + +// 16 bit channel split and merge +TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_u, kPixels * 2); + align_buffer_page_end(src_pixels_v, kPixels * 2); + align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2); + align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2); + MemRandomize(src_pixels_u, kPixels * 2); + MemRandomize(src_pixels_v, kPixels * 2); + MemRandomize(dst_pixels_opt, kPixels * 2 * 2); + MemRandomize(dst_pixels_c, kPixels * 2 * 2); + + MaskCpuFlags(disable_cpu_flags_); + MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, + (const uint16_t*)src_pixels_v, benchmark_width_, + (uint16_t*)dst_pixels_c, benchmark_width_ * 2, + benchmark_width_, benchmark_height_, 12); + 
MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, + (const uint16_t*)src_pixels_v, benchmark_width_, + (uint16_t*)dst_pixels_opt, benchmark_width_ * 2, + benchmark_width_, benchmark_height_, 12); + } + + for (int i = 0; i < kPixels * 2 * 2; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); - align_buffer_page_end(tmp_pixels_u, kPixels); - align_buffer_page_end(tmp_pixels_v, kPixels); - align_buffer_page_end(dst_pixels_opt, kPixels * 2); - align_buffer_page_end(dst_pixels_c, kPixels * 2); + align_buffer_page_end(dst_pixels_u_c, kPixels); + align_buffer_page_end(dst_pixels_v_c, kPixels); + align_buffer_page_end(dst_pixels_u_opt, kPixels); + align_buffer_page_end(dst_pixels_v_opt, kPixels); MemRandomize(src_pixels, kPixels * 2); - MemRandomize(tmp_pixels_u, kPixels); - MemRandomize(tmp_pixels_v, kPixels); - MemRandomize(dst_pixels_opt, kPixels * 2); - MemRandomize(dst_pixels_c, kPixels * 2); + MemRandomize(dst_pixels_u_c, kPixels); + MemRandomize(dst_pixels_v_c, kPixels); + MemRandomize(dst_pixels_u_opt, kPixels); + MemRandomize(dst_pixels_v_opt, kPixels); MaskCpuFlags(disable_cpu_flags_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_); - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, - dst_pixels_c, benchmark_width_ * 2, benchmark_width_, - benchmark_height_); + 
SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_c, + benchmark_width_, dst_pixels_v_c, benchmark_width_, + benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, - benchmark_width_, tmp_pixels_v, benchmark_width_, + SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_opt, + benchmark_width_, dst_pixels_v_opt, benchmark_width_, benchmark_width_, benchmark_height_); } - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, - dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, - benchmark_height_); - for (int i = 0; i < kPixels * 2; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); + EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); } free_aligned_buffer_page_end(src_pixels); - free_aligned_buffer_page_end(tmp_pixels_u); - free_aligned_buffer_page_end(tmp_pixels_v); - free_aligned_buffer_page_end(dst_pixels_opt); - free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_u_c); + free_aligned_buffer_page_end(dst_pixels_v_c); + free_aligned_buffer_page_end(dst_pixels_u_opt); + free_aligned_buffer_page_end(dst_pixels_v_opt); +} + +// 16 bit channel split +TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels, kPixels * 2 * 2); + align_buffer_page_end(dst_pixels_u_c, kPixels * 2); + align_buffer_page_end(dst_pixels_v_c, kPixels * 2); + align_buffer_page_end(dst_pixels_u_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_v_opt, kPixels * 2); + MemRandomize(src_pixels, kPixels * 2 * 2); + MemRandomize(dst_pixels_u_c, kPixels * 2); + MemRandomize(dst_pixels_v_c, kPixels * 2); + MemRandomize(dst_pixels_u_opt, kPixels * 2); + MemRandomize(dst_pixels_v_opt, kPixels * 2); + + 
MaskCpuFlags(disable_cpu_flags_); + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)dst_pixels_u_c, benchmark_width_, + (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_, + benchmark_height_, 10); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)dst_pixels_u_opt, benchmark_width_, + (uint16_t*)dst_pixels_v_opt, benchmark_width_, + benchmark_width_, benchmark_height_, 10); + } + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); + EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); + } + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_u_c); + free_aligned_buffer_page_end(dst_pixels_v_c); + free_aligned_buffer_page_end(dst_pixels_u_opt); + free_aligned_buffer_page_end(dst_pixels_v_opt); } TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); @@ -2681,7 +2742,7 @@ TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2730,7 +2791,7 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * 
benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2777,8 +2838,7 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2832,8 +2892,7 @@ TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2887,8 +2946,7 @@ TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2938,8 +2996,7 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2987,11 +3044,166 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +// 
Merge 4 channels +#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ + const int kWidth = W1280; \ + const int kPixels = kWidth * benchmark_height_; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ + align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_a, kPixels * sizeof(STYPE) + OFF); \ + memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \ + STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \ + STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \ + kWidth, NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \ + kWidth, NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + 
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(src_memory_a); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +// Merge 3 channel RGB into 4 channel XRGB with opaque alpha +#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \ + const int kWidth = W1280; \ + const int kPixels = kWidth * benchmark_height_; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ + align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \ + STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, 
kWidth, src_pixels_b, \ + kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, \ + 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) + +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10) +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12) +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16) + +#define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ + const int kWidth = W1280; \ + const int kPixels = kWidth * benchmark_height_; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ 
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \ + memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, dst_pixels_c, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, dst_pixels_opt, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) + +TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10) +TESTTPLANARTOP(MergeXR30, uint16_t, 
uint8_t, 12) +TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) + // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { // Round count up to multiple of 16 const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2); @@ -3035,8 +3247,9 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { // TODO(fbarchard): Improve test for more platforms. #ifdef HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + // Round count up to multiple of 32 + const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31; + align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); @@ -3072,8 +3285,7 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { #endif // HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, Convert16To8Plane) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels); align_buffer_page_end(dst_pixels_y_c, kPixels); @@ -3152,8 +3364,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { #endif // ENABLE_ROW_TESTS TEST_F(LibYUVPlanarTest, Convert8To16Plane) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); diff --git 
a/third_party/libyuv/unit_test/rotate_argb_test.cc b/third_party/libyuv/unit_test/rotate_argb_test.cc index 3208b66a2a..01ed69ca55 100644 --- a/third_party/libyuv/unit_test/rotate_argb_test.cc +++ b/third_party/libyuv/unit_test/rotate_argb_test.cc @@ -156,29 +156,29 @@ TEST_F(LibYUVRotateTest, RotatePlane270_Opt) { } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } diff --git a/third_party/libyuv/unit_test/rotate_test.cc b/third_party/libyuv/unit_test/rotate_test.cc index 61941e63e0..1bab584fa1 100644 --- a/third_party/libyuv/unit_test/rotate_test.cc +++ 
b/third_party/libyuv/unit_test/rotate_test.cc @@ -108,29 +108,29 @@ TEST_F(LibYUVRotateTest, I420Rotate270_Opt) { // Odd width tests work but disabled because they use C code and can be // tested by passing an odd width command line or environment variable. TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } @@ -225,29 +225,29 @@ TEST_F(LibYUVRotateTest, I444Rotate270_Opt) { // Odd width tests work but disabled because they use C code and can be // tested by passing an odd width command line or environment variable. 
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } @@ -340,29 +340,29 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) { } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - 
benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } diff --git a/third_party/libyuv/unit_test/scale_argb_test.cc b/third_party/libyuv/unit_test/scale_argb_test.cc index ac9766124f..48ad75eafd 100644 --- a/third_party/libyuv/unit_test/scale_argb_test.cc +++ b/third_party/libyuv/unit_test/scale_argb_test.cc @@ -114,8 +114,8 @@ static int ARGBTestFilter(int src_width, return max_diff; } -static const int kTileX = 8; -static const int kTileY = 8; +static const int kTileX = 64; +static const int kTileY = 64; static int TileARGBScale(const uint8_t* src_argb, int src_stride_argb, @@ -232,7 +232,7 @@ static int ARGBClipTestFilter(int src_width, #define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast<int>((x / nom) * denom) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ +#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \ int diff = ARGBTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ @@ -241,7 
+241,7 @@ static int ARGBClipTestFilter(int src_width, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \ + TEST_F(LibYUVScaleTest, DISABLED_##ARGBScaleDownClipBy##name##_##filter) { \ int diff = ARGBClipTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ @@ -251,11 +251,19 @@ static int ARGBClipTestFilter(int src_width, // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, 3) +#ifdef ENABLE_SLOW_TESTS +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(, name, None, nom, denom, 0) \ + TEST_FACTOR1(, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(, name, Box, nom, denom, 3) +#else +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \ + TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Box, nom, denom, 3) +#endif TEST_FACTOR(2, 1, 2) TEST_FACTOR(4, 1, 4) @@ -268,7 +276,7 @@ TEST_FACTOR(3, 1, 3) #undef SX #undef DX -#define TEST_SCALETO1(name, width, height, filter, max_diff) \ +#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -282,13 +290,15 @@ TEST_FACTOR(3, 1, 3) benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, 
name##ClipTo##width##x##height##_##filter) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##name##ClipTo##width##x##height##_##filter) { \ int diff = \ ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##name##ClipFrom##width##x##height##_##filter) { \ int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_); \ @@ -296,13 +306,20 @@ TEST_FACTOR(3, 1, 3) } /// Test scale to a specified size with all 4 filters. -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 3) \ - TEST_SCALETO1(name, width, height, Bilinear, 3) +#ifdef ENABLE_SLOW_TESTS +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(, name, width, height, None, 0) \ + TEST_SCALETO1(, name, width, height, Linear, 3) \ + TEST_SCALETO1(, name, width, height, Bilinear, 3) +#else +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ + TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ + TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) +#endif TEST_SCALETO(ARGBScale, 1, 1) -TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 640, 360) diff --git a/third_party/libyuv/unit_test/scale_test.cc b/third_party/libyuv/unit_test/scale_test.cc index d24806a661..6da6b574d1 100644 --- a/third_party/libyuv/unit_test/scale_test.cc +++ b/third_party/libyuv/unit_test/scale_test.cc @@ -259,6 +259,123 @@ static int I420TestFilter_12(int src_width, return max_diff; } +// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. 
+// 0 = exact. +static int I420TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int src_width_uv = (Abs(src_width) + 1) >> 1; + int src_height_uv = (Abs(src_height) + 1) >> 1; + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); + uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16); + uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16); + + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; + } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; + } + + int dst_width_uv = (dst_width + 1) >> 1; + int dst_height_uv = (dst_height + 1) >> 1; + + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); + + uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); + uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16); + uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
+ for (i = 0; i < benchmark_iterations; ++i) { + I420Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); + } + + // Expect an exact match. + int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + free_aligned_buffer_page_end(src_v_16); + + return max_diff; +} + // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int I444TestFilter(int src_width, int src_height, @@ -494,6 +611,123 @@ static int I444TestFilter_12(int src_width, return max_diff; } +// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. +// 0 = exact. 
+static int I444TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int src_width_uv = Abs(src_width); + int src_height_uv = Abs(src_height); + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); + uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16); + uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16); + + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; + } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; + } + + int dst_width_uv = dst_width; + int dst_height_uv = dst_height; + + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + 
align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); + + uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); + uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16); + uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (i = 0; i < benchmark_iterations; ++i) { + I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); + } + + // Expect an exact match. 
+ int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + free_aligned_buffer_page_end(src_v_16); + + return max_diff; +} + // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
static int NV12TestFilter(int src_width, int src_height, @@ -700,6 +934,20 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -736,6 +984,22 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \ int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ @@ -761,7 +1025,7 @@ TEST_FACTOR(3, 1, 3, 0) #endif 
TEST_SCALETO(Scale, 1, 1) -TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 640, 360) @@ -801,6 +1065,20 @@ TEST_SCALETO(Scale, 1920, 1080) disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \ + int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \ + int diff = I444TestFilter_16(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \ benchmark_height_, benchmark_width_, \ diff --git a/third_party/libyuv/unit_test/scale_uv_test.cc b/third_party/libyuv/unit_test/scale_uv_test.cc index 59eeee3043..6e4649f84d 100644 --- a/third_party/libyuv/unit_test/scale_uv_test.cc +++ b/third_party/libyuv/unit_test/scale_uv_test.cc @@ -166,7 +166,7 @@ TEST_FACTOR(3, 1, 3) TEST_SCALETO1(name, width, height, Bilinear, 3) TEST_SCALETO(UVScale, 1, 1) -TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(UVScale, 320, 240) TEST_SCALETO(UVScale, 569, 480) TEST_SCALETO(UVScale, 640, 360) diff --git a/third_party/libyuv/unit_test/unit_test.cc b/third_party/libyuv/unit_test/unit_test.cc index 85e3b7170f..e6dbc3eed6 100644 --- a/third_party/libyuv/unit_test/unit_test.cc +++ b/third_party/libyuv/unit_test/unit_test.cc @@ -26,9 +26,13 @@ unsigned int fastrand_seed = 0xfb; ABSL_FLAG(int32_t, 
libyuv_width, 0, "width of test image."); ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image."); ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test."); -ABSL_FLAG(int32_t, libyuv_flags, 0, +ABSL_FLAG(int32_t, + libyuv_flags, + 0, "cpu flags for reference code. 1 = C, -1 = SIMD"); -ABSL_FLAG(int32_t, libyuv_cpu_info, 0, +ABSL_FLAG(int32_t, + libyuv_cpu_info, + 0, "cpu flags for benchmark code. 1 = C, -1 = SIMD"); #else // Disable command line parameters if absl/flags disabled. diff --git a/third_party/libyuv/unit_test/unit_test.h b/third_party/libyuv/unit_test/unit_test.h index 87907fa160..580832addc 100644 --- a/third_party/libyuv/unit_test/unit_test.h +++ b/third_party/libyuv/unit_test/unit_test.h @@ -11,7 +11,7 @@ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ -#ifdef WIN32 +#ifdef _WIN32 #include <windows.h> #else #include <sys/resource.h> diff --git a/third_party/libyuv/unit_test/video_common_test.cc b/third_party/libyuv/unit_test/video_common_test.cc index 6c6a384d41..36728ea900 100644 --- a/third_party/libyuv/unit_test/video_common_test.cc +++ b/third_party/libyuv/unit_test/video_common_test.cc @@ -29,7 +29,7 @@ static bool TestValidFourCC(uint32_t fourcc, int bpp) { !TestValidChar((fourcc >> 24) & 0xff)) { return false; } - if (bpp < 0 || bpp > 32) { + if (bpp < 0 || bpp > 64) { return false; } return true; @@ -72,6 +72,8 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR)); EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30)); EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30)); + EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64)); + EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64)); EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG)); EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA)); diff --git a/third_party/opus/src/celt/celt_lpc.c 
b/third_party/opus/src/celt/celt_lpc.c index 457e7ed0d2..242e6df55e 100644 --- a/third_party/opus/src/celt/celt_lpc.c +++ b/third_party/opus/src/celt/celt_lpc.c @@ -50,7 +50,11 @@ int p #endif OPUS_CLEAR(lpc, p); +#ifdef FIXED_POINT if (ac[0] != 0) +#else + if (ac[0] > 1e-10f) +#endif { for (i = 0; i < p; i++) { /* Sum up this iteration's reflection coefficient */ @@ -73,10 +77,10 @@ int p error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); /* Bail out once we get 30 dB gain */ #ifdef FIXED_POINT - if (error<SHR32(ac[0],10)) + if (error<=SHR32(ac[0],10)) break; #else - if (error<.001f*ac[0]) + if (error<=.001f*ac[0]) break; #endif } diff --git a/third_party/opus/src/src/opus_encoder.c b/third_party/opus/src/src/opus_encoder.c index 321bb2bb1e..253fe9e880 100644 --- a/third_party/opus/src/src/opus_encoder.c +++ b/third_party/opus/src/src/opus_encoder.c @@ -900,10 +900,10 @@ static int decide_dtx_mode(opus_int activity, /* indicates if this fr { if (!activity) { - /* The number of consecutive DTX frames should be within the allowed bounds. - Note that the allowed bound is defined in the Silk headers and assumes 20 ms - frames. As this function can be called with any frame length, a conversion to - miliseconds is done before the comparisons. */ + /* The number of consecutive DTX frames should be within the allowed bounds. + Note that the allowed bound is defined in the SILK headers and assumes 20 ms + frames. As this function can be called with any frame length, a conversion to + milliseconds is done before the comparisons. 
*/ (*nb_no_activity_ms_Q1) += frame_size_ms_Q1; if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2) { diff --git a/third_party/pffft/pffft_unittest.cc b/third_party/pffft/pffft_unittest.cc index 559723434e..c2bf184191 100644 --- a/third_party/pffft/pffft_unittest.cc +++ b/third_party/pffft/pffft_unittest.cc @@ -68,7 +68,7 @@ void PffftValidate(int fft_size, bool complex_fft) { } for (k = 0; k < num_floats; ++k) { - ref_max = std::max(ref_max, fabs(ref[k])); + ref_max = std::max<float>(ref_max, (float) fabs(ref[k])); } // Pass 0: non canonical ordering of transform coefficients. diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h index b8e2f2d581..be2028eb27 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h @@ -34,7 +34,7 @@ #if defined(__FreeBSD__) && !defined(__Userspace__) #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/netinet/sctp_constants.h 365071 2020-09-01 21:19:14Z mjg $"); +__FBSDID("$FreeBSD$"); #endif #ifndef _NETINET_SCTP_CONSTANTS_H_ @@ -610,7 +610,7 @@ extern void getwintimeofday(struct timeval *tv); #define SCTP_RTO_UPPER_BOUND (60000) /* 60 sec in ms */ #define SCTP_RTO_LOWER_BOUND (1000) /* 1 sec is ms */ -#define SCTP_RTO_INITIAL (3000) /* 3 sec in ms */ +#define SCTP_RTO_INITIAL (1000) /* 1 sec in ms */ #define SCTP_INP_KILL_TIMEOUT 20 /* number of ms to retry kill of inpcb */ #define SCTP_ASOC_KILL_TIMEOUT 10 /* number of ms to retry kill of inpcb */ diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c index f3c3644855..fb6e4c23eb 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c @@ -108,57 +108,12 @@ sctp_handle_init(struct mbuf *m, int iphlen, int 
offset, if (stcb == NULL) { SCTP_INP_RLOCK(inp); } - /* validate length */ - if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) { - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - /* validate parameters */ + /* Validate parameters */ init = &cp->init; - if (init->initiate_tag == 0) { - /* protocol error... send abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) { - /* invalid parameter... send abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - if (init->num_inbound_streams == 0) { - /* protocol error... send abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - if (init->num_outbound_streams == 0) { + if ((ntohl(init->initiate_tag) == 0) || + (ntohl(init->a_rwnd) < SCTP_MIN_RWND) || + (ntohs(init->num_inbound_streams) == 0) || + (ntohs(init->num_outbound_streams) == 0)) { /* protocol error... 
send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, @@ -514,26 +469,34 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, asoc = &stcb->asoc; asoc->peer_supports_nat = (uint8_t)nat_friendly; /* process the peer's parameters in the INIT-ACK */ - retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb); - if (retval < 0) { + if (sctp_process_init((struct sctp_init_chunk *)cp, stcb) < 0) { if (op_err != NULL) { sctp_m_freem(op_err); } - return (retval); + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); + *abort_no_unlock = 1; + return (-1); } initack_limit = offset + ntohs(cp->ch.chunk_length); /* load all addresses */ if ((retval = sctp_load_addresses_from_init(stcb, m, - (offset + sizeof(struct sctp_init_chunk)), initack_limit, - src, dst, NULL, stcb->asoc.port))) { + offset + sizeof(struct sctp_init_chunk), + initack_limit, src, dst, NULL, stcb->asoc.port)) < 0) { if (op_err != NULL) { sctp_m_freem(op_err); } op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with address parameters"); SCTPDBG(SCTP_DEBUG_INPUT1, - "Load addresses from INIT causes an abort %d\n", - retval); + "Load addresses from INIT causes an abort %d\n", + retval); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, #if defined(__FreeBSD__) && !defined(__Userspace__) @@ -1420,57 +1383,12 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, "sctp_handle_init_ack: TCB is null\n"); return (-1); } - if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) { - /* Invalid length */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - 
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } init_ack = &cp->init; - /* validate parameters */ - if (init_ack->initiate_tag == 0) { - /* protocol error... send an abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } - if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) { - /* protocol error... send an abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } - if (init_ack->num_inbound_streams == 0) { - /* protocol error... send an abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } - if (init_ack->num_outbound_streams == 0) { + /* Validate parameters. */ + if ((ntohl(init_ack->initiate_tag) == 0) || + (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) || + (ntohs(init_ack->num_inbound_streams) == 0) || + (ntohs(init_ack->num_outbound_streams) == 0)) { /* protocol error... 
send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, @@ -1624,6 +1542,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, vrf_id, net->port); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 2; + SCTP_TCB_UNLOCK(stcb); return (NULL); } /* @@ -1638,9 +1557,11 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, (uint8_t *) & init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ + SCTP_TCB_UNLOCK(stcb); return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { + SCTP_TCB_UNLOCK(stcb); return (NULL); } /* @@ -1653,9 +1574,11 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, (uint8_t *) & initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ + SCTP_TCB_UNLOCK(stcb); return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { + SCTP_TCB_UNLOCK(stcb); return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && @@ -1681,6 +1604,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 17; + SCTP_TCB_UNLOCK(stcb); return (NULL); } switch (SCTP_GET_STATE(stcb)) { @@ -1693,10 +1617,17 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, * have the right seq no's. */ /* First we must process the INIT !! 
*/ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 3; + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* we have already processed the INIT so no problem */ @@ -1741,6 +1672,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } @@ -1776,16 +1708,22 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, break; } /* end switch */ sctp_stop_all_cookie_timers(stcb); - /* - * We ignore the return code here.. not sure if we should - * somehow abort.. but we do have an existing asoc. This - * really should not fail. 
- */ - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src, stcb->asoc.port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, stcb->asoc.port)) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 4; + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Problem with address parameters"); + SCTPDBG(SCTP_DEBUG_INPUT1, + "Load addresses from INIT causes an abort %d\n", + retval); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* respond with a COOKIE-ACK */ @@ -1805,6 +1743,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 6; + SCTP_TCB_UNLOCK(stcb); return (NULL); } /* If nat support, and the below and stcb is established, @@ -1830,6 +1769,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, mflowtype, mflowid, inp->fibnum, #endif vrf_id, port); + SCTP_TCB_UNLOCK(stcb); return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && @@ -1859,6 +1799,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 7; + SCTP_TCB_UNLOCK(stcb); return (NULL); } if (how_indx < sizeof(asoc->cookie_how)) @@ -1901,17 +1842,35 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, } } /* process the INIT info (peer's info) */ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 9; + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + 
SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src, stcb->asoc.port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, stcb->asoc.port)) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 10; + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Problem with address parameters"); + SCTPDBG(SCTP_DEBUG_INPUT1, + "Load addresses from INIT causes an abort %d\n", + retval); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || @@ -1933,6 +1892,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } @@ -1985,19 +1945,25 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, #endif if (asoc->peer_supports_nat) { + struct sctp_tcb *local_stcb; + /* This is a gross gross hack. * Just call the cookie_new code since we * are allowing a duplicate association. * I hope this works... 
*/ - return (sctp_process_cookie_new(m, iphlen, offset, src, dst, - sh, cookie, cookie_len, - inp, netp, init_src,notification, - auth_skipped, auth_offset, auth_len, + local_stcb = sctp_process_cookie_new(m, iphlen, offset, src, dst, + sh, cookie, cookie_len, + inp, netp, init_src,notification, + auth_skipped, auth_offset, auth_len, #if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, + mflowtype, mflowid, #endif - vrf_id, port)); + vrf_id, port); + if (local_stcb == NULL) { + SCTP_TCB_UNLOCK(stcb); + } + return (local_stcb); } /* * case A in Section 5.2.4 Table 2: XXMM (peer restarted) @@ -2005,11 +1971,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, /* temp code */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 12; - sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); - + sctp_stop_association_timers(stcb, false); /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_RESTART; atomic_add_int(&stcb->asoc.refcnt, 1); @@ -2042,6 +2004,10 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; asoc->send_sack = 1; + asoc->data_pkts_seen = 0; + asoc->last_data_chunk_from = NULL; + asoc->last_control_chunk_from = NULL; + asoc->last_net_cmt_send_started = NULL; if (asoc->mapping_array) { memset(asoc->mapping_array, 0, asoc->mapping_array_size); @@ -2106,6 +2072,9 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); } + asoc->ctrl_queue_cnt = 0; + asoc->str_reset = NULL; + asoc->stream_reset_outstanding = 0; TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); if (chk->data) { @@ 
-2154,11 +2123,17 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, asoc->total_flight = 0; asoc->total_flight_count = 0; /* process the INIT info (peer's info) */ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 13; - + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* @@ -2167,26 +2142,38 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ net->hb_responded = 1; - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src, stcb->asoc.port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, stcb->asoc.port)) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 14; - + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Problem with address parameters"); + SCTPDBG(SCTP_DEBUG_INPUT1, + "Load addresses from INIT causes an abort %d\n", + retval); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* respond with a COOKIE-ACK */ - sctp_stop_all_cookie_timers(stcb); - sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 15; - + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE) && + (asoc->sctp_autoclose_ticks > 0)) { + sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); + } return 
(stcb); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 16; /* all other cases... */ + SCTP_TCB_UNLOCK(stcb); return (NULL); } @@ -2360,8 +2347,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* process the INIT info (peer's info) */ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { #if defined(__APPLE__) && !defined(__Userspace__) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); @@ -2377,9 +2363,9 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, return (NULL); } /* load all addresses */ - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), initack_offset, - src, dst, init_src, port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, port)) < 0) { #if defined(__APPLE__) && !defined(__Userspace__) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); @@ -2956,12 +2942,15 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, had_a_existing_tcb = 1; *stcb = sctp_process_cookie_existing(m, iphlen, offset, src, dst, sh, - cookie, cookie_len, *inp_p, *stcb, netp, to, - ¬ification, auth_skipped, auth_offset, auth_len, + cookie, cookie_len, *inp_p, *stcb, netp, to, + ¬ification, auth_skipped, auth_offset, auth_len, #if defined(__FreeBSD__) && !defined(__Userspace__) mflowtype, mflowid, #endif vrf_id, port); + if (*stcb == NULL) { + *locked_tcb = NULL; + } } if (*stcb == NULL) { @@ -3847,19 +3836,16 @@ sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chu int len, clen; asoc = &stcb->asoc; - if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { - asoc->stream_reset_outstanding = 0; - return (NULL); - } - if (stcb->asoc.str_reset == NULL) { + chk = asoc->str_reset; + if (TAILQ_EMPTY(&asoc->control_send_queue) || + 
(chk == NULL)) { asoc->stream_reset_outstanding = 0; return (NULL); } - chk = stcb->asoc.str_reset; if (chk->data == NULL) { return (NULL); } - if (bchk) { + if (bchk != NULL) { /* he wants a copy of the chk pointer */ *bchk = chk; } @@ -4798,6 +4784,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, int ret; int abort_no_unlock = 0; int ecne_seen = 0; + int abort_flag; /* * How big should this be, and should it be alloc'd? Lets try the * d-mtu-ceiling for now (2k) and that should hopefully work ... @@ -4962,29 +4949,6 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, } return (NULL); } - } else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { - if (vtag_in != asoc->my_vtag) { - /* - * this could be a stale SHUTDOWN-ACK or the - * peer never got the SHUTDOWN-COMPLETE and - * is still hung; we have started a new asoc - * but it won't complete until the shutdown - * is completed - */ - if (stcb != NULL) { - SCTP_TCB_UNLOCK(stcb); - } - SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); - op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), - msg); - sctp_handle_ootb(m, iphlen, *offset, src, dst, - sh, inp, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, fibnum, -#endif - vrf_id, port); - return (NULL); - } } else { /* for all other chunks, vtag must match */ if (vtag_in != asoc->my_vtag) { @@ -5047,10 +5011,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, chunk_buf); if (ch == NULL) { *offset = length; - if (stcb != NULL) { - SCTP_TCB_UNLOCK(stcb); - } - return (NULL); + return (stcb); } num_chunks++; @@ -5084,12 +5045,12 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, /* The INIT chunk must be the only chunk. 
*/ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { - /* RFC 4960 requires that no ABORT is sent */ + /* + * RFC 4960bis requires stopping the + * processing of the packet. + */ *offset = length; - if (stcb != NULL) { - SCTP_TCB_UNLOCK(stcb); - } - return (NULL); + return (stcb); } /* Honor our resource limit. */ if (chk_length > SCTP_LARGEST_INIT_ACCEPTED) { @@ -5296,20 +5257,19 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { SCTP_STAT_INCR(sctps_recvheartbeat); sctp_send_heartbeat_ack(stcb, m, *offset, - chk_length, *netp); + chk_length, *netp); } break; case SCTP_HEARTBEAT_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT_ACK\n"); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) { /* Its not ours */ - *offset = length; - return (stcb); + break; } SCTP_STAT_INCR(sctps_recvheartbeatack); if ((netp != NULL) && (*netp != NULL)) { sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch, - stcb, *netp); + stcb, *netp); } break; case SCTP_ABORT_ASSOCIATION: @@ -5330,14 +5290,12 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n", (void *)stcb); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) { - *offset = length; - return (stcb); + break; } if ((netp != NULL) && (*netp != NULL)) { - int abort_flag = 0; - + abort_flag = 0; sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch, - stcb, *netp, &abort_flag); + stcb, *netp, &abort_flag); if (abort_flag) { *offset = length; return (NULL); @@ -5346,11 +5304,12 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; case SCTP_SHUTDOWN_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN_ACK, stcb %p\n", (void *)stcb); - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { + if ((chk_length == sizeof(struct sctp_shutdown_ack_chunk)) && + (stcb != NULL) && 
(netp != NULL) && (*netp != NULL)) { sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp); + *offset = length; + return (NULL); } - *offset = length; - return (NULL); break; case SCTP_OPERATION_ERROR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP_ERR\n"); @@ -5494,7 +5453,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, case SCTP_COOKIE_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE_ACK, stcb %p\n", (void *)stcb); if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) { - return (stcb); + break; } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ @@ -5524,26 +5483,29 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; case SCTP_ECN_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_ECHO\n"); - if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) { - /* Its not ours */ - *offset = length; - return (stcb); + if (stcb == NULL) { + break; } if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } + if (chk_length != sizeof(struct sctp_ecne_chunk)) { + break; + } sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch, stcb); ecne_seen = 1; break; case SCTP_ECN_CWR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_CWR\n"); - if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) { - *offset = length; - return (stcb); + if (stcb == NULL) { + break; } if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } + if (chk_length != sizeof(struct sctp_cwr_chunk)) { + break; + } sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb, *netp); break; case SCTP_SHUTDOWN_COMPLETE: @@ -5554,12 +5516,13 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, *offset = length; return (stcb); } - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { + if ((chk_length == sizeof(struct sctp_shutdown_complete_chunk)) && + (stcb != NULL) && (netp != NULL) && (*netp != NULL)) { sctp_handle_shutdown_complete((struct 
sctp_shutdown_complete_chunk *)ch, - stcb, *netp); + stcb, *netp); + *offset = length; + return (NULL); } - *offset = length; - return (NULL); break; case SCTP_ASCONF: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n"); @@ -5568,32 +5531,33 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, goto unknown_chunk; } sctp_handle_asconf(m, *offset, src, - (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); + (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); asconf_cnt++; } break; case SCTP_ASCONF_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF_ACK\n"); + if (stcb == NULL) { + break; + } + if (stcb->asoc.asconf_supported == 0) { + goto unknown_chunk; + } if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) { - /* Its not ours */ - *offset = length; - return (stcb); + break; } - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { - if (stcb->asoc.asconf_supported == 0) { - goto unknown_chunk; - } + if ((netp != NULL) && (*netp != NULL)) { /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, - stcb->asoc.overall_error_count, - 0, - SCTP_FROM_SCTP_INPUT, - __LINE__); + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_asconf_ack(m, *offset, - (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); + (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); if (abort_no_unlock) return (NULL); } @@ -5602,72 +5566,70 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, case SCTP_IFORWARD_CUM_TSN: SCTPDBG(SCTP_DEBUG_INPUT3, "%s\n", ch->chunk_type == SCTP_FORWARD_CUM_TSN ? 
"FORWARD_TSN" : "I_FORWARD_TSN"); + if (stcb == NULL) { + break; + } + if (stcb->asoc.prsctp_supported == 0) { + goto unknown_chunk; + } if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) { - /* Its not ours */ - *offset = length; - return (stcb); + break; } - - if (stcb != NULL) { - int abort_flag = 0; - - if (stcb->asoc.prsctp_supported == 0) { - goto unknown_chunk; - } - if (((stcb->asoc.idata_supported == 1) && (ch->chunk_type == SCTP_FORWARD_CUM_TSN)) || - ((stcb->asoc.idata_supported == 0) && (ch->chunk_type == SCTP_IFORWARD_CUM_TSN))) { - if (ch->chunk_type == SCTP_FORWARD_CUM_TSN) { - SCTP_SNPRINTF(msg, sizeof(msg), "%s", "FORWARD-TSN chunk received when I-FORWARD-TSN was negotiated"); - } else { - SCTP_SNPRINTF(msg, sizeof(msg), "%s", "I-FORWARD-TSN chunk received when FORWARD-TSN was negotiated"); - } - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); - *offset = length; - return (NULL); + if (((stcb->asoc.idata_supported == 1) && (ch->chunk_type == SCTP_FORWARD_CUM_TSN)) || + ((stcb->asoc.idata_supported == 0) && (ch->chunk_type == SCTP_IFORWARD_CUM_TSN))) { + if (ch->chunk_type == SCTP_FORWARD_CUM_TSN) { + SCTP_SNPRINTF(msg, sizeof(msg), "%s", "FORWARD-TSN chunk received when I-FORWARD-TSN was negotiated"); + } else { + SCTP_SNPRINTF(msg, sizeof(msg), "%s", "I-FORWARD-TSN chunk received when FORWARD-TSN was negotiated"); } - *fwd_tsn_seen = 1; - if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { - /* We are not interested anymore */ + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); + *offset = length; + return (NULL); + } + *fwd_tsn_seen = 1; + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore */ #if defined(__APPLE__) && !defined(__Userspace__) - so = SCTP_INP_SO(inp); - atomic_add_int(&stcb->asoc.refcnt, 1); - SCTP_TCB_UNLOCK(stcb); - 
SCTP_SOCKET_LOCK(so, 1); - SCTP_TCB_LOCK(stcb); - atomic_subtract_int(&stcb->asoc.refcnt, 1); + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); #if defined(__APPLE__) && !defined(__Userspace__) - SCTP_SOCKET_UNLOCK(so, 1); + SCTP_SOCKET_UNLOCK(so, 1); #endif - *offset = length; - return (NULL); - } - /* - * For sending a SACK this looks like DATA - * chunks. - */ - stcb->asoc.last_data_chunk_from = stcb->asoc.last_control_chunk_from; - sctp_handle_forward_tsn(stcb, - (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); - if (abort_flag) { - *offset = length; - return (NULL); - } + *offset = length; + return (NULL); + } + /* + * For sending a SACK this looks like DATA + * chunks. 
+ */ + stcb->asoc.last_data_chunk_from = stcb->asoc.last_control_chunk_from; + abort_flag = 0; + sctp_handle_forward_tsn(stcb, + (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); + if (abort_flag) { + *offset = length; + return (NULL); } break; case SCTP_STREAM_RESET: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n"); - if ((stcb == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req))) { - /* Its not ours */ - *offset = length; - return (stcb); + if (stcb == NULL) { + break; } if (stcb->asoc.reconfig_supported == 0) { goto unknown_chunk; } + if (chk_length < sizeof(struct sctp_stream_reset_tsn_req)) { + break; + } if (sctp_handle_stream_reset(stcb, m, *offset, ch)) { /* stop processing */ *offset = length; @@ -5676,20 +5638,19 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; case SCTP_PACKET_DROPPED: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n"); - /* re-get it all please */ + if (stcb == NULL) { + break; + } + if (stcb->asoc.pktdrop_supported == 0) { + goto unknown_chunk; + } if (chk_length < sizeof(struct sctp_pktdrop_chunk)) { - /* Its not ours */ - *offset = length; - return (stcb); + break; } - - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { - if (stcb->asoc.pktdrop_supported == 0) { - goto unknown_chunk; - } + if ((netp != NULL) && (*netp != NULL)) { sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch, - stcb, *netp, - min(chk_length, contiguous)); + stcb, *netp, + min(chk_length, contiguous)); } break; case SCTP_AUTHENTICATION: @@ -5702,21 +5663,21 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, auth_skipped = 1; } /* skip this chunk (temporarily) */ - goto next_chunk; + break; } if (stcb->asoc.auth_supported == 0) { goto unknown_chunk; } if ((chk_length < (sizeof(struct sctp_auth_chunk))) || (chk_length > (sizeof(struct sctp_auth_chunk) + - SCTP_AUTH_DIGEST_LEN_MAX))) { + SCTP_AUTH_DIGEST_LEN_MAX))) { /* Its not ours */ *offset = length; return 
(stcb); } if (got_auth == 1) { /* skip this chunk... it's already auth'd */ - goto next_chunk; + break; } got_auth = 1; if (sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch, m, *offset)) { @@ -5777,7 +5738,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, - sizeof(struct sctp_chunkhdr), chunk_buf); + sizeof(struct sctp_chunkhdr), chunk_buf); if (ch == NULL) { *offset = length; return (stcb); diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h index 6c3348ad9f..46b618110c 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h @@ -886,7 +886,7 @@ int sctp_userspace_get_mtu_from_ifn(uint32_t if_index, int af); #define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((rt != NULL) ? rt->rt_rmx.rmx_mtu : 0) -#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) sctp_userspace_get_mtu_from_ifn(if_nametoindex(((struct ifaddrs *) (sctp_ifn))->ifa_name), AF_INET) +#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) (sctp_ifn->ifn_mtu) #define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \ if (rt != NULL) \ diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c index d30019b22a..d1e84daad5 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c @@ -7533,7 +7533,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, break; } phdr = sctp_get_next_param(m, offset, ¶m_buf, - sizeof(param_buf)); + sizeof(param_buf)); } /* Now check to see if we need to purge any addresses */ TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) { @@ -7543,11 +7543,15 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf 
*m, /* remove and free it */ stcb->asoc.numnets--; TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next); - sctp_free_remote_addr(net); + if (net == stcb->asoc.alternate) { + sctp_free_remote_addr(stcb->asoc.alternate); + stcb->asoc.alternate = NULL; + } if (net == stcb->asoc.primary_destination) { stcb->asoc.primary_destination = NULL; sctp_select_primary_destination(stcb); } + sctp_free_remote_addr(net); } } if ((stcb->asoc.ecn_supported == 1) && diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c index db0e7533ff..8472c3a1c0 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c @@ -80,7 +80,25 @@ sctp_sha1_final(unsigned char *digest, struct sctp_sha1_context *ctx) { SHA1_Final(digest, &ctx->sha_ctx); } +#elif defined(SCTP_USE_MBEDTLS_SHA1) +void +sctp_sha1_init(struct sctp_sha1_context *ctx) +{ + mbedtls_sha1_init(&ctx->sha1_ctx); + mbedtls_sha1_starts_ret(&ctx->sha1_ctx); +} +void +sctp_sha1_update(struct sctp_sha1_context *ctx, const unsigned char *ptr, unsigned int siz) +{ + mbedtls_sha1_update_ret(&ctx->sha1_ctx, ptr, siz); +} + +void +sctp_sha1_final(unsigned char *digest, struct sctp_sha1_context *ctx) +{ + mbedtls_sha1_finish_ret(&ctx->sha1_ctx, digest); +} #else #include <string.h> diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h index d535ee4639..9ff4ff7bdc 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h @@ -46,6 +46,8 @@ __FBSDID("$FreeBSD$"); #include <pk11pub.h> #elif defined(SCTP_USE_OPENSSL_SHA1) #include <openssl/sha.h> +#elif defined(SCTP_USE_MBEDTLS_SHA1) +#include <mbedtls/sha1.h> #endif struct sctp_sha1_context { @@ -53,6 +55,8 @@ struct sctp_sha1_context { struct PK11Context *pk11_ctx; #elif 
defined(SCTP_USE_OPENSSL_SHA1) SHA_CTX sha_ctx; +#elif defined(SCTP_USE_MBEDTLS_SHA1) + mbedtls_sha1_context sha1_ctx; #else unsigned int A; unsigned int B; diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c index ba64aaff77..41aff19e08 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c @@ -98,23 +98,42 @@ sctp_userspace_set_threadname(const char *name) int sctp_userspace_get_mtu_from_ifn(uint32_t if_index, int af) { +#if defined(INET) || defined(INET6) struct ifreq ifr; int fd; +#endif + int mtu; - memset(&ifr, 0, sizeof(struct ifreq)); - if (if_indextoname(if_index, ifr.ifr_name) != NULL) { - /* TODO can I use the raw socket here and not have to open a new one with each query? */ - if ((fd = socket(af, SOCK_DGRAM, 0)) < 0) - return (0); - if (ioctl(fd, SIOCGIFMTU, &ifr) < 0) { + switch (af) { +#if defined(INET) + case AF_INET: +#endif +#if defined(INET6) + case AF_INET6: +#endif +#if defined(INET) || defined(INET6) + memset(&ifr, 0, sizeof(struct ifreq)); + mtu = 0; + if (if_indextoname(if_index, ifr.ifr_name) != NULL) { + /* TODO can I use the raw socket here and not have to open a new one with each query? 
*/ + if ((fd = socket(af, SOCK_DGRAM, 0)) < 0) { + break; + } + if (ioctl(fd, SIOCGIFMTU, &ifr) >= 0) { + mtu = ifr.ifr_mtu; + } close(fd); - return (0); } - close(fd); - return ifr.ifr_mtu; - } else { - return (0); + break; +#endif + case AF_CONN: + mtu = 1280; + break; + default: + mtu = 0; + break; } + return (mtu); } #endif @@ -143,41 +162,60 @@ timingsafe_bcmp(const void *b1, const void *b2, size_t n) int sctp_userspace_get_mtu_from_ifn(uint32_t if_index, int af) { +#if defined(INET) || defined(INET6) PIP_ADAPTER_ADDRESSES pAdapterAddrs, pAdapt; DWORD AdapterAddrsSize, Err; - int ret; +#endif + int mtu; - ret = 0; - AdapterAddrsSize = 0; - pAdapterAddrs = NULL; - if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, NULL, &AdapterAddrsSize)) != 0) { - if ((Err != ERROR_BUFFER_OVERFLOW) && (Err != ERROR_INSUFFICIENT_BUFFER)) { - SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() sizing failed with error code %d, AdapterAddrsSize = %d\n", Err, AdapterAddrsSize); - ret = -1; + switch (af) { +#if defined(INET) + case AF_INET: +#endif +#if defined(INET6) + case AF_INET6: +#endif +#if defined(INET) || defined(INET6) + mtu = 0; + AdapterAddrsSize = 0; + pAdapterAddrs = NULL; + if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, NULL, &AdapterAddrsSize)) != 0) { + if ((Err != ERROR_BUFFER_OVERFLOW) && (Err != ERROR_INSUFFICIENT_BUFFER)) { + SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() sizing failed with error code %d, AdapterAddrsSize = %d\n", Err, AdapterAddrsSize); + mtu = -1; + goto cleanup; + } + } + if ((pAdapterAddrs = (PIP_ADAPTER_ADDRESSES) GlobalAlloc(GPTR, AdapterAddrsSize)) == NULL) { + SCTPDBG(SCTP_DEBUG_USR, "Memory allocation error!\n"); + mtu = -1; goto cleanup; } - } - if ((pAdapterAddrs = (PIP_ADAPTER_ADDRESSES) GlobalAlloc(GPTR, AdapterAddrsSize)) == NULL) { - SCTPDBG(SCTP_DEBUG_USR, "Memory allocation error!\n"); - ret = -1; - goto cleanup; - } - if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, pAdapterAddrs, &AdapterAddrsSize)) != ERROR_SUCCESS) { 
- SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() failed with error code %d\n", Err); - ret = -1; - goto cleanup; - } - for (pAdapt = pAdapterAddrs; pAdapt; pAdapt = pAdapt->Next) { - if (pAdapt->IfIndex == if_index) { - ret = pAdapt->Mtu; - break; + if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, pAdapterAddrs, &AdapterAddrsSize)) != ERROR_SUCCESS) { + SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() failed with error code %d\n", Err); + mtu = -1; + goto cleanup; } + for (pAdapt = pAdapterAddrs; pAdapt; pAdapt = pAdapt->Next) { + if (pAdapt->IfIndex == if_index) { + mtu = pAdapt->Mtu; + break; + } + } + cleanup: + if (pAdapterAddrs != NULL) { + GlobalFree(pAdapterAddrs); + } + break; +#endif + case AF_CONN: + mtu = 1280; + break; + default: + mtu = 0; + break; } -cleanup: - if (pAdapterAddrs != NULL) { - GlobalFree(pAdapterAddrs); - } - return (ret); + return (mtu); } void diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c index e5fba96717..e8cf78017a 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c @@ -34,7 +34,7 @@ #if defined(__FreeBSD__) && !defined(__Userspace__) #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/netinet/sctp_usrreq.c 366750 2020-10-16 10:44:48Z tuexen $"); +__FBSDID("$FreeBSD$"); #endif #include <netinet/sctp_os.h> @@ -974,29 +974,29 @@ sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE))) { goto connected_type; - } else if (addr == NULL) { + } + + error = 0; + if (addr == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); error = EDESTADDRREQ; - sctp_m_freem(m); - if (control) { - sctp_m_freem(control); - control = NULL; - } - return (error); + } else if (addr->sa_family != AF_INET) { + 
SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT); + error = EAFNOSUPPORT; +#if defined(HAVE_SA_LEN) + } else if (addr->sa_len != sizeof(struct sockaddr_in)) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; +#endif } -#ifdef INET6 - if (addr->sa_family != AF_INET) { - /* must be a v4 address! */ - SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); + if (error != 0) { sctp_m_freem(m); if (control) { sctp_m_freem(control); control = NULL; } - error = EDESTADDRREQ; return (error); } -#endif /* INET6 */ connected_type: /* now what about control */ if (control) { @@ -6112,6 +6112,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, return (EINVAL); } if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && + (paddrp->spp_pathmtu > 0) && ((paddrp->spp_pathmtu < SCTP_SMALLEST_PMTU) || (paddrp->spp_pathmtu > SCTP_LARGEST_PMTU))) { if (stcb) @@ -6156,28 +6157,30 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11); } net->dest_state |= SCTP_ADDR_NO_PMTUD; - net->mtu = paddrp->spp_pathmtu; - switch (net->ro._l_addr.sa.sa_family) { + if (paddrp->spp_pathmtu > 0) { + net->mtu = paddrp->spp_pathmtu; + switch (net->ro._l_addr.sa.sa_family) { #ifdef INET - case AF_INET: - net->mtu += SCTP_MIN_V4_OVERHEAD; - break; + case AF_INET: + net->mtu += SCTP_MIN_V4_OVERHEAD; + break; #endif #ifdef INET6 - case AF_INET6: - net->mtu += SCTP_MIN_OVERHEAD; - break; + case AF_INET6: + net->mtu += SCTP_MIN_OVERHEAD; + break; #endif #if defined(__Userspace__) - case AF_CONN: - net->mtu += sizeof(struct sctphdr); - break; + case AF_CONN: + net->mtu += sizeof(struct sctphdr); + break; #endif - default: - break; - } - if (net->mtu < stcb->asoc.smallest_mtu) { - sctp_pathmtu_adjustment(stcb, net->mtu); + default: + break; + } + if (net->mtu < stcb->asoc.smallest_mtu) { + sctp_pathmtu_adjustment(stcb, net->mtu); + } 
} } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { @@ -6186,7 +6189,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } - if (paddrp->spp_pathmaxrxt) { + if (paddrp->spp_pathmaxrxt > 0) { if (net->dest_state & SCTP_ADDR_PF) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_PF; @@ -6229,7 +6232,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, #endif } else { /************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/ - if (paddrp->spp_pathmaxrxt != 0) { + if (paddrp->spp_pathmaxrxt > 0) { stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->dest_state & SCTP_ADDR_PF) { @@ -6261,7 +6264,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, net->failure_threshold = paddrp->spp_pathmaxrxt; } } - if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_hbinterval != 0) { stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval; @@ -6304,31 +6306,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_16); } net->dest_state |= SCTP_ADDR_NO_PMTUD; - net->mtu = paddrp->spp_pathmtu; - switch (net->ro._l_addr.sa.sa_family) { + if (paddrp->spp_pathmtu > 0) { + net->mtu = paddrp->spp_pathmtu; + switch (net->ro._l_addr.sa.sa_family) { #ifdef INET - case AF_INET: - net->mtu += SCTP_MIN_V4_OVERHEAD; - break; + case AF_INET: + net->mtu += SCTP_MIN_V4_OVERHEAD; + break; #endif #ifdef INET6 - case AF_INET6: - net->mtu += SCTP_MIN_OVERHEAD; - break; + case AF_INET6: + net->mtu += SCTP_MIN_OVERHEAD; + break; #endif #if defined(__Userspace__) - case AF_CONN: - net->mtu += sizeof(struct sctphdr); - break; + case AF_CONN: + net->mtu += sizeof(struct sctphdr); + break; #endif - default: - break; - } - if (net->mtu < stcb->asoc.smallest_mtu) { - sctp_pathmtu_adjustment(stcb, net->mtu); + default: + break; + } 
+ if (net->mtu < stcb->asoc.smallest_mtu) { + sctp_pathmtu_adjustment(stcb, net->mtu); + } } } - stcb->asoc.default_mtu = paddrp->spp_pathmtu; + if (paddrp->spp_pathmtu > 0) { + stcb->asoc.default_mtu = paddrp->spp_pathmtu; + } sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { @@ -6374,7 +6380,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, * For the TOS/FLOWLABEL stuff you set it * with the options on the socket */ - if (paddrp->spp_pathmaxrxt != 0) { + if (paddrp->spp_pathmaxrxt > 0) { inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; } @@ -6400,7 +6406,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, inp->sctp_ep.default_mtu = 0; sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } else if (paddrp->spp_flags & SPP_PMTUD_DISABLE) { - inp->sctp_ep.default_mtu = paddrp->spp_pathmtu; + if (paddrp->spp_pathmtu > 0) { + inp->sctp_ep.default_mtu = paddrp->spp_pathmtu; + } sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_DSCP) { diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c index 79838e40da..639b36f307 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c @@ -4837,7 +4837,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, * if there is return 1, else return 0. 
*/ int -sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t * vtagfill) +sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t *vtag) { struct sctp_chunkhdr *ch; struct sctp_init_chunk *init_chk, chunk_buf; @@ -4858,12 +4858,13 @@ sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t * vtagfill) /* yep, tell them */ return (1); } - if (ch->chunk_type == SCTP_INITIATION) { + if ((ch->chunk_type == SCTP_INITIATION) || + (ch->chunk_type == SCTP_INITIATION_ACK)) { /* need to update the Vtag */ init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m, - offset, sizeof(*init_chk), (uint8_t *) & chunk_buf); + offset, sizeof(struct sctp_init_chunk), (uint8_t *) & chunk_buf); if (init_chk != NULL) { - *vtagfill = ntohl(init_chk->init.initiate_tag); + *vtag = ntohl(init_chk->init.initiate_tag); } } /* Nope, move to the next chunk */ diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c index 5a931dd5a2..aa0c0051a5 100644 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c @@ -34,7 +34,7 @@ #if defined(__FreeBSD__) && !defined(__Userspace__) #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/netinet6/sctp6_usrreq.c 365071 2020-09-01 21:19:14Z mjg $"); +__FBSDID("$FreeBSD$"); #endif #include <netinet/sctp_os.h> @@ -259,13 +259,14 @@ sctp6_input(struct mbuf **i_pak, int *offp, int proto) if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { goto out; } - ecn_bits = ((ntohl(ip6->ip6_flow) >> 20) & 0x000000ff); #if defined(__FreeBSD__) + ecn_bits = IPV6_TRAFFIC_CLASS(ip6); if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { #else + ecn_bits = ((ntohl(ip6->ip6_flow) >> 20) & 0x000000ff); if (SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) && (IN6_ARE_ADDR_EQUAL(&src.sin6_addr, &dst.sin6_addr))) { 
SCTP_STAT_INCR(sctps_recvhwcrc); @@ -654,9 +655,10 @@ out: return (error); } -SYSCTL_PROC(_net_inet6_sctp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW, - 0, 0, - sctp6_getcred, "S,ucred", "Get the ucred of a SCTP6 connection"); +SYSCTL_PROC(_net_inet6_sctp6, OID_AUTO, getcred, + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT, + 0, 0, sctp6_getcred, "S,ucred", + "Get the ucred of a SCTP6 connection"); #endif /* This is the same as the sctp_abort() could be made common */ @@ -1007,6 +1009,46 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *nam, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EDESTADDRREQ); return (EDESTADDRREQ); } + switch (addr->sa_family) { +#ifdef INET + case AF_INET: +#if defined(HAVE_SA_LEN) + if (addr->sa_len != sizeof(struct sockaddr_in)) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); + return (EINVAL); + } +#endif + break; +#endif +#ifdef INET6 + case AF_INET6: +#if defined(HAVE_SA_LEN) + if (addr->sa_len != sizeof(struct sockaddr_in6)) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); + return (EINVAL); + } +#endif + break; +#endif + default: + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); + return (EINVAL); + } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; if (SCTP_IPV6_V6ONLY(inp)) { @@ -1015,10 +1057,20 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *nam, * v4 addr or v4-mapped addr */ if (addr->sa_family == AF_INET) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } if 
(IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } |